[ { "id": "B1-Hhnslg", "title": "Prototypical Networks for Few-shot Learning", "track": "main", "status": "Reject", "tldr": "We learn a metric space in which few-shot classification can be performed by computing Euclidean distances to a single prototype representative of each class.", "abstract": "A recent approach to few-shot classification called matching networks has demonstrated the benefits of coupling metric learning with a training procedure that mimics test. This approach relies on a complicated fine-tuning procedure and an attention scheme that forms a distribution over all points in the support set, scaling poorly with its size. We propose a more streamlined approach, prototypical networks, that learns a metric space in which few-shot classification can be performed by computing Euclidean distances to prototype representations of each class, rather than individual points. Our method is competitive with state-of-the-art one-shot classification approaches while being much simpler and more scalable with the size of the support set. We empirically demonstrate the performance of our approach on the Omniglot and mini-ImageNet datasets. We further demonstrate that a similar idea can be used for zero-shot learning, where each class is described by a set of attributes, and achieve state-of-the-art results on the Caltech UCSD bird dataset.", "keywords": "Deep learning;Transfer Learning", "primary_area": "", "supplementary_material": "", "author": "Jake Snell;Kevin Swersky;Richard Zemel", "authorids": "jsnell@cs.toronto.edu;kswersky@twitter.com;zemel@cs.toronto.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nsnell2017prototypical,\ntitle={Prototypical Networks for Few-shot Learning},\nauthor={Jake Snell and Kevin Swersky and Richard Zemel},\nyear={2017},\nurl={https://openreview.net/forum?id=B1-Hhnslg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1-Hhnslg", "pdf_size": 0, "rating": "4;5;6", "confidence": "5;3;4", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": -0.5, "gs_citation": 10893, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8721743270682962846&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 13 }, { "id": "B1-q5Pqxl", "title": "Machine Comprehension Using Match-LSTM and Answer Pointer", "track": "main", "status": "Poster", "tldr": "Using Match-LSTM and Answer Pointer to select a variable length answer from a paragraph", "abstract": "Machine comprehension of text is an important problem in natural language processing. A recently released dataset, the Stanford Question Answering Dataset (SQuAD), offers a large number of real questions and their answers created by humans through crowdsourcing. SQuAD provides a challenging testbed for evaluating machine comprehension algorithms, partly because compared with previous datasets, in SQuAD the answers do not come from a small set of candidate answers and they have variable lengths. We propose an end-to-end neural architecture for the task. The architecture is based on match-LSTM, a model we proposed previously for textual entailment, and Pointer Net, a sequence-to-sequence model proposed by Vinyals et al. (2015) to constrain the output tokens to be from the input sequences. We propose two ways of using Pointer Net for our tasks. 
Our experiments show that both of our two models substantially outperform the best results obtained by Rajpurkar et al. (2016) using logistic regression and manually crafted features. Besides, our boundary model also achieves the best performance on the MSMARCO dataset (Nguyen et al. 2016).", "keywords": "Natural language processing;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Shuohang Wang;Jing Jiang", "authorids": "shwang.2014@phdis.smu.edu.sg;jingjiang@smu.edu.sg", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nwang2017machine,\ntitle={Machine Comprehension Using Match-{LSTM} and Answer Pointer},\nauthor={Shuohang Wang and Jing Jiang},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=B1-q5Pqxl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1-q5Pqxl", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;3;3", "rating_avg": 6.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 18, "authors#_avg": 2, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 700, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8536399820764102165&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "B16Jem9xe", "title": "Learning in Implicit Generative Models", "track": "main", "status": "Workshop", "tldr": "Showing connections between GANs, ABC, ratio estimation and other approaches for learning in deep generative models.", "abstract": "Generative adversarial networks (GANs) provide an algorithmic framework for constructing generative models with several appealing properties: they do not require a likelihood function to be specified, only a generating procedure; they provide samples that are sharp and compelling; and they allow us to harness our knowledge of building highly accurate neural network classifiers. Here, we develop our understanding of GANs with the aim of forming a rich view of this growing area of machine learning---to build connections to the diverse set of statistical thinking on this topic, of which much can be gained by a mutual exchange of ideas. We frame GANs within the wider landscape of algorithms for learning in implicit generative models---models that only specify a stochastic procedure with which to generate data---and relate these ideas to modelling problems in related fields, such as econometrics and approximate Bayesian computation. We develop likelihood-free inference methods and highlight hypothesis testing as a principle for learning in implicit generative models, using which we are able to derive the objective function used by GANs, and many other related objectives. The testing viewpoint directs our focus to the general problem of density ratio estimation. There are four approaches for density ratio estimation, one of which is a solution using classifiers to distinguish real from generated data. 
Other approaches such as divergence minimisation and moment matching have also been explored in the GAN literature, and we synthesise these views to form an understanding in terms of the relationships between them and the wider literature, highlighting avenues for future exploration and cross-pollination.", "keywords": "Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Shakir Mohamed;Balaji Lakshminarayanan", "authorids": "shakir@google.com;balajiln@google.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nmohamed2017learning,\ntitle={Learning in Implicit Generative Models},\nauthor={Shakir Mohamed and Balaji Lakshminarayanan},\nyear={2017},\nurl={https://openreview.net/forum?id=B16Jem9xe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=B16Jem9xe", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;3;4", "rating_avg": 7.0, "confidence_avg": 3.6666666666666665, "replies_avg": 15, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 518, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15485648899220126725&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "B16dGcqlx", "title": "Third Person Imitation Learning", "track": "main", "status": "Poster", "tldr": "Agent watches another agent at a different camera angle completing the task and learns via raw pixels how to imitate. ", "abstract": "Reinforcement learning (RL) makes it possible to train agents capable of achieving\nsophisticated goals in complex and uncertain environments. A key difficulty in\nreinforcement learning is specifying a reward function for the agent to optimize.\nTraditionally, imitation learning in RL has been used to overcome this problem.\nUnfortunately, hitherto imitation learning methods tend to require that demonstrations\nare supplied in the first-person: the agent is provided with a sequence of\nstates and a specification of the actions that it should have taken. While powerful,\nthis kind of imitation learning is limited by the relatively hard problem of collecting\nfirst-person demonstrations. Humans address this problem by learning from\nthird-person demonstrations: they observe other humans perform tasks, infer the\ntask, and accomplish the same task themselves.\nIn this paper, we present a method for unsupervised third-person imitation learning.\nHere third-person refers to training an agent to correctly achieve a simple\ngoal in a simple environment when it is provided a demonstration of a teacher\nachieving the same goal but from a different viewpoint; and unsupervised refers\nto the fact that the agent receives only these third-person demonstrations, and is\nnot provided a correspondence between teacher states and student states. Our\nmethods primary insight is that recent advances from domain confusion can be\nutilized to yield domain agnostic features which are crucial during the training\nprocess. 
To validate our approach, we report successful experiments on learning\nfrom third-person demonstrations in a pointmass domain, a reacher domain, and\ninverted pendulum.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bradly C Stadie;Pieter Abbeel;Ilya Sutskever", "authorids": "bstadie@openai.com;pieter@openai.com;ilyasu@openai.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nstadie2017third,\ntitle={Third Person Imitation Learning},\nauthor={Bradly C Stadie and Pieter Abbeel and Ilya Sutskever},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=B16dGcqlx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer5", "site": "https://openreview.net/forum?id=B16dGcqlx", "pdf_size": 0, "rating": "5;6;6", "confidence": "3;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 14, "authors#_avg": 3, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 285, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16810145848030531947&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "B184E5qee", "title": "Improving Neural Language Models with a Continuous Cache", "track": "main", "status": "Poster", "tldr": "", "abstract": "We propose an extension to neural network language models to adapt their prediction to the recent history. Our model is a simplified version of memory augmented networks, which stores past hidden activations as memory and accesses them through a dot product with the current hidden activation. This mechanism is very efficient and scales to very large memory sizes. We also draw a link between the use of external memory in neural network and cache models used with count based language models. 
We demonstrate on several language model datasets that our approach performs significantly better than recent memory augmented networks.", "keywords": "Natural language processing", "primary_area": "", "supplementary_material": "", "author": "Edouard Grave;Armand Joulin;Nicolas Usunier", "authorids": "egrave@fb.com;ajoulin@fb.com;usunier@fb.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ngrave2017improving,\ntitle={Improving Neural Language Models with a Continuous Cache},\nauthor={Edouard Grave and Armand Joulin and Nicolas Usunier},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=B184E5qee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B184E5qee", "pdf_size": 0, "rating": "5;7;9", "confidence": "4;5;5", "rating_avg": 7.0, "confidence_avg": 4.666666666666667, "replies_avg": 14, "authors#_avg": 3, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 357, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3789899360774374068&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "B186cP9gx", "title": "Eigenvalues of the Hessian in Deep Learning: Singularity and Beyond", "track": "main", "status": "Reject", "tldr": "The eigenvalues of the Hessian of loss functions in deep learning have two components: singular bulk at zero that depends on the over-parametrization, and the discrete part that depends on the data.", "abstract": "We look at the eigenvalues of the Hessian of a loss function before and after training. The eigenvalue distribution is seen to be composed of two parts, the bulk which is concentrated around zero, and the edges which are scattered away from zero. 
We present empirical evidence for the bulk indicating how over-parametrized the system is, and for the edges that depend on the input data.", "keywords": "Optimization;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Levent Sagun;Leon Bottou;Yann LeCun", "authorids": "leventsagun@gmail.com;leon@bottou.org;yann@cs.nyu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nsagun2017eigenvalues,\ntitle={Eigenvalues of the Hessian in Deep Learning: Singularity and Beyond},\nauthor={Levent Sagun and Leon Bottou and Yann LeCun},\nyear={2017},\nurl={https://openreview.net/forum?id=B186cP9gx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B186cP9gx", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;5;5", "rating_avg": 3.6666666666666665, "confidence_avg": 4.666666666666667, "replies_avg": 16, "authors#_avg": 3, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 215, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3290490945331448681&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "B1E7Pwqgl", "title": "Cooperative Training of Descriptor and Generator Networks", "track": "main", "status": "Reject", "tldr": "Cooperative training of the descriptor and generator networks by coupling two maximum likelihood learning algorithms.", "abstract": "This paper studies the cooperative training of two probabilistic models of signals such as images. Both models are parametrized by convolutional neural networks (ConvNets). The first network is a descriptor network, which is an exponential family model or an energy-based model, whose feature statistics or energy function are defined by a bottom-up ConvNet, which maps the observed signal to the feature statistics. The second network is a generator network, which is a non-linear version of factor analysis. It is defined by a top-down ConvNet, which maps the latent factors to the observed signal. The maximum likelihood training algorithms of both the descriptor net and the generator net are in the form of alternating back-propagation, and both algorithms involve Langevin sampling. 
We observe that the two training algorithms can cooperate with each other by jump-starting each other\u2019s Langevin sampling, and they can be seamlessly interwoven into a CoopNets algorithm that can train both nets simultaneously.", "keywords": "Unsupervised Learning;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Jianwen Xie;Yang Lu;Ruiqi Gao;Song-Chun Zhu;Ying Nian Wu", "authorids": "jianwen@ucla.edu;yanglv@ucla.edu;ruiqigao@ucla.edu;sczhu@stat.ucla.edu;ywu@stat.ucla.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nxie2017cooperative,\ntitle={Cooperative Training of Descriptor and Generator Networks},\nauthor={Jianwen Xie and Yang Lu and Ruiqi Gao and Song-Chun Zhu and Ying Nian Wu},\nyear={2017},\nurl={https://openreview.net/forum?id=B1E7Pwqgl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=B1E7Pwqgl", "pdf_size": 0, "rating": "3;4;6", "confidence": "4;3;4", "rating_avg": 4.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 11, "authors#_avg": 5, "corr_rating_confidence": 0.18898223650461363, "gs_citation": 170, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18202808849093155435&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9 }, { "id": "B1ElR4cgg", "title": "Adversarially Learned Inference", "track": "main", "status": "Poster", "tldr": "We present and adverserially trained generative model with an inference network. Samples quality is high. Competitive semi-supervised results are achieved.", "abstract": "We introduce the adversarially learned inference (ALI) model, which jointly\nlearns a generation network and an inference network using an adversarial\nprocess. The generation network maps samples from stochastic latent variables to\nthe data space while the inference network maps training examples in data space\nto the space of latent variables. An adversarial game is cast between these two\nnetworks and a discriminative network that is trained to distinguish between\njoint latent/data-space samples from the generative network and joint samples\nfrom the inference network. 
We illustrate the ability of the model to learn\nmutually coherent inference and generation networks through the inspections of\nmodel samples and reconstructions and confirm the usefulness of the learned\nrepresentations by obtaining a performance competitive with other recent\napproaches on the semi-supervised SVHN task.", "keywords": "Computer vision;Deep learning;Unsupervised Learning;Semi-Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Vincent Dumoulin;Ishmael Belghazi;Ben Poole;Alex Lamb;Martin Arjovsky;Olivier Mastropietro;Aaron Courville", "authorids": "vincent.dumoulin@umontreal.ca;ishmael.belghazi@gmail.com;poole@cs.stanford.edu;alex6200@gmail.com;martinarjovsky@gmail.com;oli.mastro@gmail.com;aaron.courville@gmail.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\ndumoulin2017adversarially,\ntitle={Adversarially Learned Inference},\nauthor={Vincent Dumoulin and Ishmael Belghazi and Ben Poole and Alex Lamb and Martin Arjovsky and Olivier Mastropietro and Aaron Courville},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=B1ElR4cgg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=B1ElR4cgg", "pdf_size": 0, "rating": "7;7;8", "confidence": "3;4;4", "rating_avg": 7.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 16, "authors#_avg": 7, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 1907, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10451598130846693107&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "B1G9tvcgx", "title": "Neural Machine Translation with Latent Semantic of Image and Text", "track": "main", "status": "Reject", "tldr": "", "abstract": "Although attention-based Neural Machine Translation have achieved great success, attention-mechanism cannot capture the entire meaning of the source sentence because the attention mechanism generates a target word depending heavily on the relevant parts of the source sentence. The report of earlier studies has introduced a latent variable to capture the entire meaning of sentence and achieved improvement on attention-based Neural Machine Translation. We follow this approach and we believe that the capturing meaning of sentence benefits from image information because human beings understand the meaning of language not only from textual information but also from perceptual information such as that gained from vision. As described herein, we propose a neural machine translation model that introduces a continuous latent variable containing an underlying semantic extracted from texts and images. Our model, which can be trained end-to-end, requires image information only when training. 
Experiments conducted with an English\u2013German translation task show that our model outperforms over the baseline.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Joji Toyama;Masanori Misono;Masahiro Suzuki;Kotaro Nakayama;Yutaka Matsuo", "authorids": "toyama@weblab.t.u-tokyo.ac.jp;misono@weblab.t.u-tokyo.ac.jp;masa@weblab.t.u-tokyo.ac.jp;k-nakayama@weblab.t.u-tokyo.ac.jp;matsuo@weblab.t.u-tokyo.ac.jp", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\ntoyama2017neural,\ntitle={Neural Machine Translation with Latent Semantic of Image and Text},\nauthor={Joji Toyama and Masanori Misono and Masahiro Suzuki and Kotaro Nakayama and Yutaka Matsuo},\nyear={2017},\nurl={https://openreview.net/forum?id=B1G9tvcgx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1G9tvcgx", "pdf_size": 0, "rating": "3;3;4", "confidence": "5;4;5", "rating_avg": 3.3333333333333335, "confidence_avg": 4.666666666666667, "replies_avg": 13, "authors#_avg": 5, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7609938886149997407&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "B1GOWV5eg", "title": "Learning to Repeat: Fine Grained Action Repetition for Deep Reinforcement Learning", "track": "main", "status": "Poster", "tldr": "Framework for temporal abstractions in policy space by learning to repeat actions", "abstract": "Reinforcement Learning algorithms can learn complex behavioral patterns for sequential decision making tasks wherein an agent interacts with an environment and acquires feedback in the form of rewards sampled from it. Traditionally, such algorithms make decisions, i.e., select actions to execute, at every single time step of the agent-environment interactions. In this paper, we propose a novel framework, Fine Grained Action Repetition (FiGAR), which enables the agent to decide the action as well as the time scale of repeating it.\nFiGAR can be used for improving any Deep Reinforcement Learning algorithm which maintains an explicit policy estimate by enabling temporal abstractions in the action space and implicitly enabling planning through sequences of repetitive macro-actions. \nWe empirically demonstrate the efficacy of our framework by showing performance improvements on top of three policy search algorithms in different domains: Asynchronous Advantage Actor Critic in the Atari 2600 domain, Trust Region Policy Optimization in Mujoco domain and Deep Deterministic Policy Gradients in the TORCS car racing domain.\n", "keywords": "Deep learning;Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Sahil Sharma;Aravind S. Lakshminarayanan;Balaraman Ravindran", "authorids": "ssahil08@gmail.com;aravindsrinivas@gmail.com;ravi@cse.iitm.ac.in", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nsharma2017learning,\ntitle={Learning to Repeat: Fine Grained Action Repetition for Deep Reinforcement Learning},\nauthor={Sahil Sharma and Aravind S. 
Lakshminarayanan and Balaraman Ravindran},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=B1GOWV5eg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=B1GOWV5eg", "pdf_size": 0, "rating": "7;8;8", "confidence": "3;5;4", "rating_avg": 7.666666666666667, "confidence_avg": 4.0, "replies_avg": 19, "authors#_avg": 3, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 145, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16447933204492604054&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "B1Igu2ogg", "title": "Efficient Vector Representation for Documents through Corruption", "track": "main", "status": "Poster", "tldr": "a simple document representation learning framework that is very efficient to train and test", "abstract": "We present an efficient document representation learning framework, Document Vector through Corruption (Doc2VecC). Doc2VecC represents each document as a simple average of word embeddings. It ensures a representation generated as such captures the semantic meanings of the document during learning. A corruption model is included, which introduces a data-dependent regularization that favors informative or rare words while forcing the embeddings of common and non-discriminative ones to be close to zero. Doc2VecC produces significantly better word embeddings than Word2Vec. We compare Doc2VecC with several state-of-the-art document representation learning algorithms. The simple model architecture introduced by Doc2VecC matches or out-performs the state-of-the-art in generating high-quality document representations for sentiment analysis, document classification as well as semantic relatedness tasks. The simplicity of the model enables training on billions of words per hour on a single machine. 
At the same time, the model is very efficient in generating representations of unseen documents at test time.\n", "keywords": "Natural language processing;Deep learning;Semi-Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Minmin Chen", "authorids": "m.chen@criteo.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@inproceedings{\nchen2017efficient,\ntitle={Efficient Vector Representation for Documents through Corruption},\nauthor={Minmin Chen},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=B1Igu2ogg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1Igu2ogg", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;4;3", "rating_avg": 6.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 20, "authors#_avg": 1, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 179, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=746730949769495426&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "B1IzH7cxl", "title": "A Neural Stochastic Volatility Model", "track": "main", "status": "Reject", "tldr": "A novel integration of statistical models with recurrent neural networks providing a new way of formulating volatility models.", "abstract": "In this paper, we show that the recent integration of statistical models with recurrent neural networks provides a new way of formulating volatility models that have been popular in time series analysis and prediction. The model comprises a pair of complementary stochastic recurrent neural networks: the generative network models the joint distribution of the stochastic volatility process; the inference network approximates the conditional distribution of the latent variables given the observable ones.\nOur focus in this paper is on the formulation of temporal dynamics of volatility over time under a stochastic recurrent neural network framework. Our derivations show that some popular volatility models are a special case of our proposed neural stochastic volatility model. 
Experiments demonstrate that the proposed model generates a smoother volatility estimation, and largely outperforms a widely used GARCH model on several metrics about the fitness of the volatility modelling and the accuracy of the prediction.", "keywords": "Deep learning;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Rui Luo;Xiaojun Xu;Weinan Zhang;Jun Wang", "authorids": "r.luo@cs.ucl.ac.uk;xuxj@apex.sjtu.edu.cn;wnzhang@apex.sjtu.edu.cn;j.wang@cs.ucl.ac.uk", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nluo2017a,\ntitle={A Neural Stochastic Volatility Model},\nauthor={Rui Luo and Xiaojun Xu and Weinan Zhang and Jun Wang},\nyear={2017},\nurl={https://openreview.net/forum?id=B1IzH7cxl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1IzH7cxl", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 10, "authors#_avg": 4, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 92, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2417562945828597065&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11 }, { "id": "B1KBHtcel", "title": "Here's My Point: Argumentation Mining with Pointer Networks", "track": "main", "status": "Reject", "tldr": "We use a modified Pointer Network to predict 1) types of argument components; 2) links between argument components.", "abstract": "One of the major goals in automated argumentation mining is to uncover the argument structure present in argumentative text. In order to determine this structure, one must understand how different individual components of the overall argument are linked. General consensus in this field dictates that the argument components form a hierarchy of persuasion, which manifests itself in a tree structure. This work provides the first neural network-based approach to argumentation mining, focusing on extracting links between argument components, with a secondary focus on classifying types of argument components. In order to solve this problem, we propose to use a modification of a Pointer Network architecture. A Pointer Network is appealing for this task for the following reasons: 1) It takes into account the sequential nature of argument components; 2) By construction, it enforces certain properties of the tree structure present in argument relations; 3) The hidden representations can be applied to auxiliary tasks. In order to extend the contribution of the original Pointer Network model, we construct a joint model that simultaneously attempts to learn the type of argument component, as well as continuing to predict links between argument components. The proposed model achieves state-of-the-art results on two separate evaluation corpora. 
Furthermore, our results show that optimizing for both tasks, as well as adding a fully-connected layer prior to recurrent neural network input, is crucial for high performance.", "keywords": "Natural language processing", "primary_area": "", "supplementary_material": "", "author": "Peter Potash;Alexey Romanov;Anna Rumshisky", "authorids": "ppotash@cs.uml.edu;aromanov@cs.uml.edu;arum@cs.uml.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\npotash2017heres,\ntitle={Here's My Point: Argumentation Mining with Pointer Networks},\nauthor={Peter Potash and Alexey Romanov and Anna Rumshisky},\nyear={2017},\nurl={https://openreview.net/forum?id=B1KBHtcel}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1KBHtcel", "pdf_size": 0, "rating": "4;5;5", "confidence": "3;4;4", "rating_avg": 4.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 14, "authors#_avg": 3, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18160774140129920489&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "B1M8JF9xx", "title": "On the Quantitative Analysis of Decoder-Based Generative Models", "track": "main", "status": "Poster", "tldr": "We propose to use Annealed Importance Sampling to evaluate decoder-based generative network, and investigate various properties of these models.", "abstract": "The past several years have seen remarkable progress in generative models which produce convincing samples of images and other modalities. A shared component of some popular models such as generative adversarial networks and generative moment matching networks, is a decoder network, a parametric deep neural net that defines a generative distribution. Unfortunately, it can be difficult to quantify the performance of these models because of the intractability of log-likelihood estimation, and inspecting samples can be misleading. We propose to use Annealed Importance Sampling for evaluating log-likelihoods for decoder-based models and validate its accuracy using bidirectional Monte Carlo. 
Using this technique, we analyze the performance of decoder-based models, the effectiveness of existing log-likelihood estimators, the degree of overfitting, and the degree to which these models miss important modes of the data distribution.", "keywords": "Deep learning;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Yuhuai Wu;Yuri Burda;Ruslan Salakhutdinov;Roger Grosse", "authorids": "ywu@cs.toronto.edu;yburda@openai.com;rsalakhu@cs.cmu.edu;rgrosse@cs.toronto.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nwu2017on,\ntitle={On the Quantitative Analysis of Decoder-Based Generative Models},\nauthor={Yuhuai Wu and Yuri Burda and Ruslan Salakhutdinov and Roger Grosse},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=B1M8JF9xx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1M8JF9xx", "pdf_size": 0, "rating": "6;7;7", "confidence": "5;4;4", "rating_avg": 6.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 18, "authors#_avg": 4, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 285, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5247073711186247499&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "B1MRcPclx", "title": "Query-Reduction Networks for Question Answering", "track": "main", "status": "Poster", "tldr": "", "abstract": "In this paper, we study the problem of question answering when reasoning over multiple facts is required. We propose Query-Reduction Network (QRN), a variant of Recurrent Neural Network (RNN) that effectively handles both short-term (local) and long-term (global) sequential dependencies to reason over multiple facts. QRN considers the context sentences as a sequence of state-changing triggers, and reduces the original query to a more informed query as it observes each trigger (context sentence) through time. Our experiments show that QRN produces the state-of-the-art results in bAbI QA and dialog tasks, and in a real goal-oriented dialog dataset. In addition, QRN formulation allows parallelization on RNN's time axis, saving an order of magnitude in time complexity for training and inference. 
", "keywords": "Natural language processing;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Minjoon Seo;Sewon Min;Ali Farhadi;Hannaneh Hajishirzi", "authorids": "minjoon@cs.washington.edu;shmsw25@snu.ac.kr;ali@cs.washington.edu;hannaneh@cs.washington.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nseo2017queryreduction,\ntitle={Query-Reduction Networks for Question Answering},\nauthor={Minjoon Seo and Sewon Min and Ali Farhadi and Hannaneh Hajishirzi},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=B1MRcPclx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=B1MRcPclx", "pdf_size": 0, "rating": "7;7;7", "confidence": "3;4;4", "rating_avg": 7.0, "confidence_avg": 3.6666666666666665, "replies_avg": 17, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 114, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7146143453708893892&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "B1PA8fqeg", "title": "Multiagent System for Layer Free Network", "track": "main", "status": "Reject", "tldr": "We propose a multiagent system that have feed-forward networks as its subset but free from layer scheme.", "abstract": "We propose a multiagent system that have feedforward networks as its subset \nwhile free from layer structure with matrix-vector scheme.\nDeep networks are often compared to the brain neocortex or visual perception system.\nOne of the largest difference from human brain is the use of matrix-vector multiplication based on layer architecture.\nIt would help understanding the way human brain works\nif we manage to develop good deep network model without the layer architecture while preserving their performance.\nThe brain neocortex works as an aggregation of the local level interactions between neurons, \nwhich is rather similar to multiagent system consists of autonomous partially observing agents\nthan units aligned in column vectors and manipulated by global level algorithm.\nTherefore we suppose that it is an effective approach for developing more biologically plausible model while preserving compatibility with deep networks to alternate units with multiple agents.\nOur method also has advantage in scalability and memory efficiency.\nWe reimplemented Stacked Denoising Autoencoder(SDAE) as a concrete instance with our multiagent system and verified its equivalence with the standard SDAE from both theoritical and empirical perspectives.\nAdditionary, we also proposed a variant of our multiagent SDAE named \"Sparse Connect SDAE\",\nand showed its computational advantage with the MNIST dataset.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hiroki Kurotaki;Kotaro Nakayama;Yutaka Matsuo", "authorids": "kurotaki@weblab.t.u-tokyo.ac.jp;nakayama@weblab.t.u-tokyo.ac.jp;matsuo@weblab.t.u-tokyo.ac.jp", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nkurotaki2017multiagent,\ntitle={Multiagent System for Layer Free Network},\nauthor={Hiroki Kurotaki and Kotaro Nakayama and Yutaka 
Matsuo},\nyear={2017},\nurl={https://openreview.net/forum?id=B1PA8fqeg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1PA8fqeg", "pdf_size": 0, "rating": "1;2;3", "confidence": "5;5;3", "rating_avg": 2.0, "confidence_avg": 4.333333333333333, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ihvdQB7u9uEJ:scholar.google.com/&scioq=Multiagent+System+for+Layer+Free+Network&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "B1TTpYKgx", "title": "On the Expressive Power of Deep Neural Networks", "track": "main", "status": "Reject", "tldr": "Derives and explains the exponential depth sensitivity of different expressivity measures for deep neural networks, and explores consequences during and after training. ", "abstract": "We study the expressive power of deep neural networks before and after\ntraining. Considering neural nets after random initialization, we show that\nthree natural measures of expressivity all display an exponential dependence\non the depth of the network. We prove, theoretically and experimentally,\nthat all of these measures are in fact related to a fourth quantity, trajectory\nlength. This quantity grows exponentially in the depth of the network, and\nis responsible for the depth sensitivity observed. These results translate\nto consequences for networks during and after training. They suggest that\nparameters earlier in a network have greater influence on its expressive power\n\u2013 in particular, given a layer, its influence on expressivity is determined by\nthe remaining depth of the network after that layer. This is verified with\nexperiments on MNIST and CIFAR-10. 
We also explore the effect of training\non the input-output map, and find that it trades off between the stability\nand expressivity of the input-output map.", "keywords": "Theory;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Maithra Raghu;Ben Poole;Jon Kleinberg;Surya Ganguli;Jascha Sohl-Dickstein", "authorids": "maithrar@gmail.com;benmpoole@gmail.com;kleinber@cs.cornell.edu;sganguli@stanford.edu;jaschasd@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nraghu2017on,\ntitle={On the Expressive Power of Deep Neural Networks},\nauthor={Maithra Raghu and Ben Poole and Jon Kleinberg and Surya Ganguli and Jascha Sohl-Dickstein},\nyear={2017},\nurl={https://openreview.net/forum?id=B1TTpYKgx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1TTpYKgx", "pdf_size": 0, "rating": "3;5;6", "confidence": "3;3;5", "rating_avg": 4.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 16, "authors#_avg": 5, "corr_rating_confidence": 0.7559289460184546, "gs_citation": 1054, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6162680448928462350&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12 }, { "id": "B1YfAfcgl", "title": "Entropy-SGD: Biasing Gradient Descent Into Wide Valleys", "track": "main", "status": "Poster", "tldr": "This paper focuses on developing new optimization tools for deep learning that are tailored to exploit the local geometric properties of the objective function.", "abstract": "This paper proposes a new optimization algorithm called Entropy-SGD for training deep neural networks that is motivated by the local geometry of the energy landscape. Local extrema with low generalization error have a large proportion of almost-zero eigenvalues in the Hessian with very few positive or negative eigenvalues. We leverage upon this observation to construct a local-entropy-based objective function that favors well-generalizable solutions lying in large flat regions of the energy landscape, while avoiding poorly-generalizable solutions located in the sharp valleys. Conceptually, our algorithm resembles two nested loops of SGD where we use Langevin dynamics in the inner loop to compute the gradient of the local entropy before each update of the weights. We show that the new objective has a smoother energy landscape and show improved generalization over SGD using uniform stability, under certain assumptions. 
Our experiments on convolutional and recurrent neural networks demonstrate that Entropy-SGD compares favorably to state-of-the-art techniques in terms of generalization error and training time.", "keywords": "Deep learning;Optimization", "primary_area": "", "supplementary_material": "", "author": "Pratik Chaudhari;Anna Choromanska;Stefano Soatto;Yann LeCun;Carlo Baldassi;Christian Borgs;Jennifer Chayes;Levent Sagun;Riccardo Zecchina", "authorids": "pratikac@ucla.edu;achoroma@cims.nyu.edu;soatto@cs.ucla.edu;yann@cs.nyu.edu;carlo.baldassi@polito.it;borgs@microsoft.com;jchayes@microsoft.com;sagun@cims.nyu.edu;riccardo.zecchina@polito.it", "gender": ";;;;;;;;", "homepage": ";;;;;;;;", "dblp": ";;;;;;;;", "google_scholar": ";;;;;;;;", "orcid": ";;;;;;;;", "linkedin": ";;;;;;;;", "or_profile": ";;;;;;;;", "aff": ";;;;;;;;", "aff_domain": ";;;;;;;;", "position": ";;;;;;;;", "bibtex": "@inproceedings{\nchaudhari2017entropysgd,\ntitle={Entropy-{SGD}: Biasing Gradient Descent Into Wide Valleys},\nauthor={Pratik Chaudhari and Anna Choromanska and Stefano Soatto and Yann LeCun and Carlo Baldassi and Christian Borgs and Jennifer Chayes and Levent Sagun and Riccardo Zecchina},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=B1YfAfcgl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4;AnonReviewer2", "site": "https://openreview.net/forum?id=B1YfAfcgl", "pdf_size": 0, "rating": "7;8;9", "confidence": "4;4;3", "rating_avg": 8.0, "confidence_avg": 3.6666666666666665, "replies_avg": 33, "authors#_avg": 9, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 899, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=496423886429566828&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 14 }, { "id": "B1ZXuTolx", "title": "Revisiting Denoising Auto-Encoders", "track": "main", "status": "Reject", "tldr": "Modified objective for denoising autoencoders with explicit robustness in the encoding.", "abstract": "Denoising auto-encoders (DAE)s were proposed as a simple yet powerful way to obtain representations in an unsupervised manner by learning a map that approximates the clean inputs from their corrupted versions. However, the original objective function proposed for DAEs does not guarantee that denoising happens only at the encoding stages. We argue that a better representation can be obtained if the encoder is forced to carry out most of the denoising effort. Here, we propose a simple modification to the DAE's objective function that accomplishes the above goal. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Luis Gonzalo Sanchez Giraldo", "authorids": "lgsanchez@cs.miami.edu", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\ngiraldo2017revisiting,\ntitle={Revisiting Denoising Auto-Encoders},\nauthor={Luis Gonzalo Sanchez Giraldo},\nyear={2017},\nurl={https://openreview.net/forum?id=B1ZXuTolx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1ZXuTolx", "pdf_size": 0, "rating": "4;4;5", "confidence": "5;4;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 11, "authors#_avg": 1, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=199216486483509207&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "B1akgy9xx", "title": "Making Stochastic Neural Networks from Deterministic Ones", "track": "main", "status": "Reject", "tldr": "", "abstract": "It has been believed that stochastic feedforward neural networks (SFNN) have several advantages beyond deterministic deep neural networks (DNN): they have more expressive power allowing multi-modal mappings and regularize better due to their stochastic nature. However, training SFNN is notoriously harder. In this paper, we aim at developing efficient training methods for large-scale SFNN, in particular using known architectures and pre-trained parameters of DNN. To this end, we propose a new intermediate stochastic model, called Simplified-SFNN, which can be built upon any baseline DNN and approximates certain SFNN by simplifying its upper latent units above stochastic ones. The main novelty of our approach is in establishing the connection between three models, i.e., DNN\n-> Simplified-SFNN -> SFNN, which naturally leads to an efficient training procedure of the stochastic models utilizing pre-trained parameters of DNN. Using several popular DNNs, we show how they can be effectively transferred to the corresponding stochastic models for both multi-modal and classification tasks on MNIST, TFD, CIFAR-10, CIFAR-100 and SVHN datasets. 
In particular, our stochastic model built from the wide residual network has 28 layers and 36 million parameters, where the former consistently outperforms the latter for the classification tasks on CIFAR-10 and CIFAR-100 due to its stochastic regularizing effect.", "keywords": "Deep learning;Multi-modal learning;Structured prediction", "primary_area": "", "supplementary_material": "", "author": "Kimin Lee;Jaehyung Kim;Song Chong;Jinwoo Shin", "authorids": "kiminlee@kaist.ac.kr;jaehyungkim@kaist.ac.kr;songchong@kaist.edu;jinwoos@kaist.ac.kr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nlee2017making,\ntitle={Making Stochastic Neural Networks from Deterministic Ones},\nauthor={Kimin Lee and Jaehyung Kim and Song Chong and Jinwoo Shin},\nyear={2017},\nurl={https://openreview.net/forum?id=B1akgy9xx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1akgy9xx", "pdf_size": 0, "rating": "5;6", "confidence": "5;4", "rating_avg": 5.5, "confidence_avg": 4.5, "replies_avg": 14, "authors#_avg": 4, "corr_rating_confidence": -0.9999999999999999, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9240293623913713661&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "B1ckMDqlg", "title": "Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer", "track": "main", "status": "Poster", "tldr": "", "abstract": "The capacity of a neural network to absorb information is limited by its number of parameters. Conditional computation, where parts of the network are active on a per-example basis, has been proposed in theory as a way of dramatically increasing model capacity without a proportional increase in computation. In practice, however, there are significant algorithmic and performance challenges. In this work, we address these challenges and finally realize the promise of conditional computation, achieving greater than 1000x improvements in model capacity with only minor losses in computational efficiency on modern GPU clusters. We introduce a Sparsely-Gated Mixture-of-Experts layer (MoE), consisting of up to thousands of feed-forward sub-networks. A trainable gating network determines a sparse combination of these experts to use for each example. We apply the MoE to the tasks of language modeling and machine translation, where model capacity is critical for absorbing the vast quantities of knowledge available in the training corpora. We present model architectures in which a MoE with up to 137 billion parameters is applied convolutionally between stacked LSTM layers. 
On large language modeling and machine translation benchmarks, these models achieve significantly better results than state-of-the-art at lower computational cost.", "keywords": "Deep learning", "primary_area": "", "supplementary_material": "", "author": "Noam Shazeer;*Azalia Mirhoseini;*Krzysztof Maziarz;Andy Davis;Quoc Le;Geoffrey Hinton;Jeff Dean", "authorids": "noam@google.com;azalia@google.com;krzysztof.maziarz@student.uj.edu.pl;andydavis@google.com;qvl@google.com;geoffhinton@google.com;jeff@google.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nshazeer2017,\ntitle={ Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer},\nauthor={Noam Shazeer and *Azalia Mirhoseini and *Krzysztof Maziarz and Andy Davis and Quoc Le and Geoffrey Hinton and Jeff Dean},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=B1ckMDqlg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1ckMDqlg", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;4;4", "rating_avg": 6.666666666666667, "confidence_avg": 4.0, "replies_avg": 23, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 3097, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15166356379734033992&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 16 }, { "id": "B1ewdt9xe", "title": "Deep Predictive Coding Networks for Video Prediction and Unsupervised Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "While great strides have been made in using deep learning algorithms to solve supervised learning tasks, the problem of unsupervised learning - leveraging unlabeled examples to learn about the structure of a domain - remains a difficult unsolved challenge. Here, we explore prediction of future frames in a video sequence as an unsupervised learning rule for learning about the structure of the visual world. We describe a predictive neural network (\"PredNet\") architecture that is inspired by the concept of \"predictive coding\" from the neuroscience literature. These networks learn to predict future frames in a video sequence, with each layer in the network making local predictions and only forwarding deviations from those predictions to subsequent network layers. We show that these networks are able to robustly learn to predict the movement of synthetic (rendered) objects, and that in doing so, the networks learn internal representations that are useful for decoding latent object parameters (e.g. pose) that support object recognition with fewer training views. We also show that these networks can scale to complex natural image streams (car-mounted camera videos), capturing key aspects of both egocentric movement and the movement of objects in the visual scene, and the representation learned in this setting is useful for estimating the steering angle. 
These results suggest that prediction represents a powerful framework for unsupervised learning, allowing for implicit learning of object and scene structure.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "William Lotter;Gabriel Kreiman;David Cox", "authorids": "lotter@fas.harvard.edu;gabriel.kreiman@tch.harvard.edu;davidcox@fas.harvard.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nlotter2017deep,\ntitle={Deep Predictive Coding Networks for Video Prediction and Unsupervised Learning},\nauthor={William Lotter and Gabriel Kreiman and David Cox},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=B1ewdt9xe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1ewdt9xe", "pdf_size": 0, "rating": "6;8;8", "confidence": "3;5;4", "rating_avg": 7.333333333333333, "confidence_avg": 4.0, "replies_avg": 13, "authors#_avg": 3, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 1230, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11254875356366916799&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 15 }, { "id": "B1gtu5ilg", "title": "Transfer of View-manifold Learning to Similarity Perception of Novel Objects", "track": "main", "status": "Poster", "tldr": "DCNN trained with multiple views of the same object can develop human-like perpetual similarity judgment that can transfer to novel objects", "abstract": "We develop a model of perceptual similarity judgment based on re-training a deep convolution neural network (DCNN) that learns to associate different views of each 3D object to capture the notion of object persistence and continuity in our visual experience. The re-training process effectively performs distance metric learning under the object persistency constraints, to modify the view-manifold of object representations. It reduces the effective distance between the representations of different views of the same object without compromising the distance between those of the views of different objects, resulting in the untangling of the view-manifolds between individual objects within the same category and across categories. This untangling enables the model to discriminate and recognize objects within the same category, independent of viewpoints. We found that this ability is not limited to the trained objects, but transfers to novel objects in both trained and untrained categories, as well as to a variety of completely novel artificial synthetic objects. This transfer in learning suggests the modification of distance metrics in view- manifolds is more general and abstract, likely at the levels of parts, and independent of the specific objects or categories experienced during training. 
Interestingly, the resulting transformation of feature representation in the deep networks is found to significantly better match human perceptual similarity judgment than AlexNet, suggesting that object persistence could be an important constraint in the development of perceptual similarity judgment in biological neural networks.\n", "keywords": "Deep learning;Transfer Learning", "primary_area": "", "supplementary_material": "", "author": "Xingyu Lin;Hao Wang;Zhihao Li;Yimeng Zhang;Alan Yuille;Tai Sing Lee", "authorids": "sean.linxingyu@pku.edu.cn;hao.wang@pku.edu.cn;zhihaol@andrew.cmu.edu;yimengzh@andrew.cmu.edu;alan.yuille@jhu.edu;tai@cnbc.cmu.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nlin2017transfer,\ntitle={Transfer of View-manifold Learning to Similarity Perception of Novel Objects},\nauthor={Xingyu Lin and Hao Wang and Zhihao Li and Yimeng Zhang and Alan Yuille and Tai Sing Lee},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=B1gtu5ilg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1gtu5ilg", "pdf_size": 0, "rating": "5;6;7", "confidence": "5;4;3", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 6, "corr_rating_confidence": -1.0, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2263072564235958873&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5 }, { "id": "B1hdzd5lg", "title": "Words or Characters? Fine-grained Gating for Reading Comprehension", "track": "main", "status": "Poster", "tldr": "", "abstract": "Previous work combines word-level and character-level representations using concatenation or scalar weighting, which is suboptimal for high-level tasks like reading comprehension. We present a fine-grained gating mechanism to dynamically combine word-level and character-level representations based on properties of the words. We also extend the idea of fine-grained gating to modeling the interaction between questions and paragraphs for reading comprehension. Experiments show that our approach can improve the performance on reading comprehension tasks, achieving new state-of-the-art results on the Children's Book Test and Who Did What datasets. To demonstrate the generality of our gating mechanism, we also show improved results on a social media tag prediction task.", "keywords": "Natural language processing;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Zhilin Yang;Bhuwan Dhingra;Ye Yuan;Junjie Hu;William W. Cohen;Ruslan Salakhutdinov", "authorids": "zhiliny@cs.cmu.edu;bdhingra@andrew.cmu.edu;yey1@andrew.cmu.edu;junjieh@cmu.edu;wcohen@cs.cmu.edu;rsalakhu@cs.cmu.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nyang2017words,\ntitle={Words or Characters? Fine-grained Gating for Reading Comprehension},\nauthor={Zhilin Yang and Bhuwan Dhingra and Ye Yuan and Junjie Hu and William W. 
Cohen and Ruslan Salakhutdinov},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=B1hdzd5lg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1hdzd5lg", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;4;3", "rating_avg": 6.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 10, "authors#_avg": 6, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 100, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2963631440494790998&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "B1jnyXXJx", "title": "Charged Point Normalization: An Efficient Solution to the Saddle Point Problem", "track": "main", "status": "Workshop", "tldr": "", "abstract": "Recently, the problem of local minima in very high dimensional non-convex optimization has been challenged and the problem of saddle points has been introduced. This paper introduces a dynamic type of normalization that forces the system to escape saddle points. Unlike other saddle point escaping algorithms, second order information is not utilized, and the system can be trained with an arbitrary gradient descent learner. The system drastically improves learning in a range of deep neural networks on various data-sets in comparison to non-CPN neural networks.", "keywords": "Deep learning;Computer vision;Optimization", "primary_area": "", "supplementary_material": "", "author": "Armen Aghajanyan", "authorids": "armen.ag@live.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\naghajanyan2017charged,\ntitle={Charged Point Normalization: An Efficient Solution to the Saddle Point Problem},\nauthor={Armen Aghajanyan},\nyear={2017},\nurl={https://openreview.net/forum?id=B1jnyXXJx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1jnyXXJx", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;3;4", "rating_avg": 4.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 18, "authors#_avg": 1, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:WIv4m6IP1w8J:scholar.google.com/&scioq=Charged+Point+Normalization:+An+Efficient+Solution+to+the+Saddle+Point+Problem&hl=en&as_sdt=0,33", "gs_version_total": 4 }, { "id": "B1kJ6H9ex", "title": "Combining policy gradient and Q-learning", "track": "main", "status": "Poster", "tldr": "We combine a policy gradient style update with a Q-learning style update into a single RL algorithm we call PGQL.", "abstract": "Policy gradient is an efficient technique for improving a policy in a reinforcement learning setting. However, vanilla online variants are on-policy only and not able to take advantage of off-policy data. In this paper we describe a new technique that combines policy gradient with off-policy Q-learning, drawing experience from a replay buffer. This is motivated by making a connection between the fixed points of the regularized policy gradient algorithm and the Q-values. This connection allows us to estimate the Q-values from the action preferences of the policy, to which we apply Q-learning updates. We refer to the new technique as \u2018PGQL\u2019, for policy gradient and Q-learning. 
We also establish an equivalency between action-value fitting techniques and actor-critic algorithms, showing that regularized policy gradient techniques can be interpreted as advantage function learning algorithms. We conclude with some numerical examples that demonstrate improved data efficiency and stability of PGQL. In particular, we tested PGQL on the full suite of Atari games and achieved performance exceeding that of both asynchronous advantage actor-critic (A3C) and Q-learning. \n", "keywords": "Deep learning;Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Brendan O'Donoghue;Remi Munos;Koray Kavukcuoglu;Volodymyr Mnih", "authorids": "bodonoghue@google.com;munos@google.com;korayk@google.com;vmnih@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\no'donoghue2017combining,\ntitle={Combining policy gradient and Q-learning},\nauthor={Brendan O'Donoghue and Remi Munos and Koray Kavukcuoglu and Volodymyr Mnih},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=B1kJ6H9ex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=B1kJ6H9ex", "pdf_size": 0, "rating": "7;7;9", "confidence": "4;3;5", "rating_avg": 7.666666666666667, "confidence_avg": 4.0, "replies_avg": 35, "authors#_avg": 4, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 190, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14239596076603723468&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "B1mAJI9gl", "title": "Towards Understanding the Invertibility of Convolutional Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Several recent works have empirically observed that Convolutional Neural Nets (CNNs) are (approximately) invertible. To understand this approximate invertibility phenomenon and how to leverage it more effectively, we focus on a theoretical explanation and develop a mathematical model of sparse signal recovery that is consistent with CNNs with random weights. We give an exact connection to a particular model of model-based compressive sensing (and its recovery algorithms) and random-weight CNNs. We show empirically that several learned networks are consistent with our mathematical analysis and then demonstrate that with such a simple theoretical framework, we can obtain reasonable reconstruction results on real images. We also discuss gaps between our model assumptions and the CNN trained for classification in practical scenarios.", "keywords": "Deep learning;Theory", "primary_area": "", "supplementary_material": "", "author": "Anna C. Gilbert;Yi Zhang;Kibok Lee;Yuting Zhang;Honglak Lee", "authorids": "annacg@umich.edu;yeezhang@umich.edu;kibok@umich.edu;yutingzh@umich.edu;honglak@umich.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\ngilbert2017towards,\ntitle={Towards Understanding the Invertibility of Convolutional Neural Networks},\nauthor={Anna C. 
Gilbert and Yi Zhang and Kibok Lee and Yuting Zhang and Honglak Lee},\nyear={2017},\nurl={https://openreview.net/forum?id=B1mAJI9gl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1mAJI9gl", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;4;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 12, "authors#_avg": 5, "corr_rating_confidence": -0.944911182523068, "gs_citation": 81, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7345402595461995220&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9 }, { "id": "B1oK8aoxe", "title": "Stochastic Neural Networks for Hierarchical Reinforcement Learning", "track": "main", "status": "Poster", "tldr": "We propose a framework for learning a diverse set of skills using stochastic neural networks with minimum supervision, and utilize these skills in a hierarchical architecture to solve challenging tasks with sparse rewards", "abstract": "Deep reinforcement learning has achieved many impressive results in recent years. However, tasks with sparse rewards or long horizons continue to pose significant challenges. To tackle these important problems, we propose a general framework that first learns useful skills in a pre-training environment, and then leverages the acquired skills for learning faster in downstream tasks.\nOur approach brings together some of the strengths of intrinsic motivation and hierarchical methods: the learning of useful skill is guided by a single proxy reward, the design of which requires very minimal domain knowledge about the downstream tasks. Then a high-level policy is trained on top of these skills, providing a significant improvement of the exploration and allowing to tackle sparse rewards in the downstream tasks. To efficiently pre-train a large span of skills, we use Stochastic Neural Networks combined with an information-theoretic regularizer. 
Our experiments show that this combination is effective in learning a wide span of interpretable skills in a sample-efficient way, and can significantly boost the learning performance uniformly across a wide range of downstream tasks.", "keywords": "Deep learning;Unsupervised Learning;Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Carlos Florensa;Yan Duan;Pieter Abbeel", "authorids": "florensa@berkeley.edu;rocky@openai.com;pieter@openai.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nflorensa2017stochastic,\ntitle={Stochastic Neural Networks for Hierarchical Reinforcement Learning},\nauthor={Carlos Florensa and Yan Duan and Pieter Abbeel},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=B1oK8aoxe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1oK8aoxe", "pdf_size": 0, "rating": "7;7;8", "confidence": "4;4;4", "rating_avg": 7.333333333333333, "confidence_avg": 4.0, "replies_avg": 18, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 445, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15662620749719187568&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "B1s6xvqlx", "title": "Recurrent Environment Simulators", "track": "main", "status": "Poster", "tldr": "", "abstract": "Models that can simulate how environments change in response to actions can be used by agents to plan and act efficiently. We improve on previous environment simulators from high-dimensional pixel observations by introducing recurrent neural networks that are able to make temporally and spatially coherent predictions for hundreds of time-steps into the future. We present an in-depth analysis of the factors affecting performance, providing the most extensive attempt to advance the understanding of the properties of these models. We address the issue of computationally inefficiency with a model that does not need to generate a high-dimensional image at each time-step. 
We show that our approach can be used to improve exploration and is adaptable to many diverse environments, namely 10 Atari games, a 3D car racing environment, and complex 3D mazes.", "keywords": "Deep learning;Unsupervised Learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Silvia Chiappa;S\u00e9bastien Racaniere;Daan Wierstra;Shakir Mohamed", "authorids": "csilvia@google.com;sracaniere@google.com;wierstra@google.com;shakir@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nchiappa2017recurrent,\ntitle={Recurrent Environment Simulators},\nauthor={Silvia Chiappa and S{\\'e}bastien Racaniere and Daan Wierstra and Shakir Mohamed},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=B1s6xvqlx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer1", "site": "https://openreview.net/forum?id=B1s6xvqlx", "pdf_size": 0, "rating": "5;7;8", "confidence": "4;5;4", "rating_avg": 6.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 15, "authors#_avg": 4, "corr_rating_confidence": 0.18898223650461357, "gs_citation": 247, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6576274297774475679&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "B1vRTeqxg", "title": "Learning Continuous Semantic Representations of Symbolic Expressions", "track": "main", "status": "Workshop", "tldr": "Assign continuous vectors to logical and algebraic symbolic expressions in such a way that semantically equivalent, but syntactically diverse expressions are assigned to identical (or highly similar) continuous vectors.", "abstract": "The question of how procedural knowledge is represented and inferred is a fundamental problem in machine learning and artificial intelligence. Recent work on program induction has proposed neural architectures, based on abstractions like stacks, Turing machines, and interpreters, that operate on abstract computational machines or on execution traces. But the recursive abstraction that is central to procedural knowledge is perhaps most naturally represented by symbolic representations that have syntactic structure, such as logical expressions and source code. Combining abstract, symbolic reasoning with continuous neural reasoning is a grand challenge of representation learning. As a step in this direction, we propose a new architecture, called neural equivalence networks, for the problem of learning continuous semantic representations of mathematical and logical expressions. These networks are trained to represent semantic equivalence, even of expressions that are syntactically very different. The challenge is that semantic representations must be computed in a syntax-directed manner, because semantics is compositional, but at the same time, small changes in syntax can lead to very large changes in semantics, which can be difficult for continuous neural architectures. 
We perform an exhaustive evaluation on the task of checking equivalence on a highly diverse class of symbolic algebraic and boolean expression types, showing that our model significantly outperforms existing architectures.\n", "keywords": "Deep learning", "primary_area": "", "supplementary_material": "", "author": "Miltiadis Allamanis;Pankajan Chanthirasegaran;Pushmeet Kohli;Charles Sutton", "authorids": "m.allamanis@ed.ac.uk;pankajan.chanthirasegaran@ed.ac.uk;pkohli@microsoft.com;csutton@ed.ac.uk", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nallamanis2017learning,\ntitle={Learning Continuous Semantic Representations of Symbolic Expressions},\nauthor={Miltiadis Allamanis and Pankajan Chanthirasegaran and Pushmeet Kohli and Charles Sutton},\nyear={2017},\nurl={https://openreview.net/forum?id=B1vRTeqxg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1vRTeqxg", "pdf_size": 0, "rating": "5;6;7", "confidence": "3;4;3", "rating_avg": 6.0, "confidence_avg": 3.3333333333333335, "replies_avg": 15, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 128, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11184386302743667447&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9 }, { "id": "BJ--gPcxl", "title": "Semi-Supervised Learning with Context-Conditional Generative Adversarial Networks", "track": "main", "status": "Reject", "tldr": "Training GANs to in-paint images produces feature representations that yield leading results on various benchmarks.", "abstract": "We introduce a simple semi-supervised learning approach for images based on in-painting using an adversarial loss. Images with random patches removed are presented to a generator whose task is to fill in the hole, based on the surrounding pixels. The in-painted images are then presented to a discriminator network that judges if they are real (unaltered training images) or not. This task acts as a regularizer for standard supervised training of the discriminator. Using our approach we are able to directly train large VGG-style networks in a semi-supervised fashion. 
We evaluate on STL-10 and PASCAL datasets, where our approach obtains performance comparable or superior to existing methods.\n", "keywords": "Deep learning;Semi-Supervised Learning;Computer vision", "primary_area": "", "supplementary_material": "", "author": "Emily Denton;Sam Gross;Rob Fergus", "authorids": "denton@cs.nyu.edu;sgross@fb.com;robfergus@fb.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ndenton2017semisupervised,\ntitle={Semi-Supervised Learning with Context-Conditional Generative Adversarial Networks},\nauthor={Emily Denton and Sam Gross and Rob Fergus},\nyear={2017},\nurl={https://openreview.net/forum?id=BJ--gPcxl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BJ--gPcxl", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 16, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 212, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9120307326237904208&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "BJ0Ee8cxx", "title": "Hierarchical Memory Networks", "track": "main", "status": "Reject", "tldr": "We propose a hierarchical memory organization strategy for efficient memory access in memory networks with large memory.", "abstract": "Memory networks are neural networks with an explicit memory component that can be both read and written to by the network. The memory is often addressed in a soft way using a softmax function, making end-to-end training with backpropagation possible. However, this is not computationally scalable for applications which require the network to read from extremely large memories. On the other hand, it is well known that hard attention mechanisms based on reinforcement learning are challenging to train successfully. In this paper, we explore a form of hierarchical memory network, which can be considered as a hybrid between hard and soft attention memory networks. The memory is organized in a hierarchical structure such that reading from it is done with less computation than soft attention over a flat memory, while also being easier to train than hard attention over a flat memory. Specifically, we propose to incorporate Maximum Inner Product Search (MIPS) in the training and inference procedures for our hierarchical memory network. 
We explore the use of various state-of-the art approximate MIPS techniques and report results on SimpleQuestions, a challenging large scale factoid question answering task.\n", "keywords": "Deep learning;Natural language processing", "primary_area": "", "supplementary_material": "", "author": "Sarath Chandar;Sungjin Ahn;Hugo Larochelle;Pascal Vincent;Gerald Tesauro;Yoshua Bengio", "authorids": "apsarathchandar@gmail.com;sjn.ahn@gmail.com;hugo@twitter.com;vincentp@iro.umontreal.ca;gtesauro@us.ibm.com;yoshua.bengio@umontreal.ca", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nchandar2017hierarchical,\ntitle={Hierarchical Memory Networks},\nauthor={Sarath Chandar and Sungjin Ahn and Hugo Larochelle and Pascal Vincent and Gerald Tesauro and Yoshua Bengio},\nyear={2017},\nurl={https://openreview.net/forum?id=BJ0Ee8cxx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BJ0Ee8cxx", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;3;5", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 13, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 112, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4946890069532779966&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "BJ3filKll", "title": "Efficient Representation of Low-Dimensional Manifolds using Deep Networks", "track": "main", "status": "Poster", "tldr": "We show constructively that deep networks can learn to represent manifold data efficiently", "abstract": "We consider the ability of deep neural networks to represent data that lies near a low-dimensional manifold in a high-dimensional space. We show that deep networks can efficiently extract the intrinsic, low-dimensional coordinates of such data. Specifically we show that the first two layers of a deep network can exactly embed points lying on a monotonic chain, a special type of piecewise linear manifold, mapping them to a low-dimensional Euclidean space. Remarkably, the network can do this using an almost optimal number of parameters. We also show that this network projects nearby points onto the manifold and then embeds them with little error. Experiments demonstrate that training with stochastic gradient descent can indeed find efficient representations similar to the one presented in this paper.", "keywords": "Theory;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Ronen Basri;David W. Jacobs", "authorids": "ronen.basri@weizmann.ac.il;djacobs@cs.umd.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nbasri2017efficient,\ntitle={Efficient Representation of Low-Dimensional Manifolds using Deep Networks},\nauthor={Ronen Basri and David W. 
Jacobs},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=BJ3filKll}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer2", "site": "https://openreview.net/forum?id=BJ3filKll", "pdf_size": 0, "rating": "5;6;7", "confidence": "3;3;5", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 2, "corr_rating_confidence": 0.8660254037844387, "gs_citation": 52, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2779422764043753318&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "BJ46w6Ule", "title": "Dynamic Partition Models", "track": "main", "status": "Reject", "tldr": "Learning of compact binary representations through partitioning of the variables", "abstract": "We present a new approach for learning compact and intuitive distributed representations with binary encoding. Rather than summing up expert votes as in products of experts, we employ for each variable the opinion of the most reliable expert. Data points are hence explained through a partitioning of the variables into expert supports. The partitions are dynamically adapted based on which experts are active. During the learning phase we adopt a smoothed version of this model that uses separate mixtures for each data dimension. In our experiments we achieve accurate reconstructions of high-dimensional data points with at most a dozen experts.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Marc Goessling;Yali Amit", "authorids": "goessling@uchicago.edu;goessling@uchicago.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ngoessling2017dynamic,\ntitle={Dynamic Partition Models},\nauthor={Marc Goessling and Yali Amit},\nyear={2017},\nurl={https://openreview.net/forum?id=BJ46w6Ule}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BJ46w6Ule", "pdf_size": 0, "rating": "3;3;6", "confidence": "4;4;3", "rating_avg": 4.0, "confidence_avg": 3.6666666666666665, "replies_avg": 12, "authors#_avg": 2, "corr_rating_confidence": -1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:jI5LZoM9WsAJ:scholar.google.com/&scioq=Dynamic+Partition+Models&hl=en&as_sdt=0,33", "gs_version_total": 4 }, { "id": "BJ5UeU9xx", "title": "Visualizing Deep Neural Network Decisions: Prediction Difference Analysis", "track": "main", "status": "Poster", "tldr": "Method for visualizing evidence for and against deep convolutional neural network classification decisions in a given input image.", "abstract": "This article presents the prediction difference analysis method for visualizing the response of a deep neural network to a specific input. When classifying images, the method highlights areas in a given input image that provide evidence for or against a certain class. It overcomes several shortcoming of previous methods and provides great additional insight into the decision making process of classifiers. Making neural network decisions interpretable through visualization is important both to improve models and to accelerate the adoption of black-box classifiers in application areas such as medicine. 
We illustrate the method in experiments on natural images (ImageNet data), as well as medical images (MRI brain scans).", "keywords": "Deep learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Luisa M Zintgraf;Taco S Cohen;Tameem Adel;Max Welling", "authorids": "lmzintgraf@gmail.com;t.s.cohen@uva.nl;tameem.hesham@gmail.com;m.welling@uva.nl", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nzintgraf2017visualizing,\ntitle={Visualizing Deep Neural Network Decisions: Prediction Difference Analysis},\nauthor={Luisa M Zintgraf and Taco S Cohen and Tameem Adel and Max Welling},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=BJ5UeU9xx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BJ5UeU9xx", "pdf_size": 0, "rating": "6;6;9", "confidence": "5;4;4", "rating_avg": 7.0, "confidence_avg": 4.333333333333333, "replies_avg": 16, "authors#_avg": 4, "corr_rating_confidence": -0.5, "gs_citation": 930, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13321146675816614452&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "BJ6oOfqge", "title": "Temporal Ensembling for Semi-Supervised Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "In this paper, we present a simple and efficient method for training deep neural networks in a semi-supervised setting where only a small portion of training data is labeled. We introduce self-ensembling, where we form a consensus prediction of the unknown labels using the outputs of the network-in-training on different epochs, and most importantly, under different regularization and input augmentation conditions. This ensemble prediction can be expected to be a better predictor for the unknown labels than the output of the network at the most recent training epoch, and can thus be used as a target for training. Using our method, we set new records for two standard semi-supervised learning benchmarks, reducing the (non-augmented) classification error rate from 18.44% to 7.05% in SVHN with 500 labels and from 18.63% to 16.55% in CIFAR-10 with 4000 labels, and further to 5.12% and 12.16% by enabling the standard augmentations. We additionally obtain a clear improvement in CIFAR-100 classification accuracy by using random images from the Tiny Images dataset as unlabeled extra inputs during training. 
Finally, we demonstrate good tolerance to incorrect labels.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Samuli Laine;Timo Aila", "authorids": "slaine@nvidia.com;taila@nvidia.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nlaine2017temporal,\ntitle={Temporal Ensembling for Semi-Supervised Learning},\nauthor={Samuli Laine and Timo Aila},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=BJ6oOfqge}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BJ6oOfqge", "pdf_size": 0, "rating": "7;8;9", "confidence": "4;5;4", "rating_avg": 8.0, "confidence_avg": 4.333333333333333, "replies_avg": 13, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 3503, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12742815032693937464&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 6 }, { "id": "BJ8fyHceg", "title": "Tuning Recurrent Neural Networks with Reinforcement Learning", "track": "main", "status": "Workshop", "tldr": "RL Tuner is a method for refining an LSTM trained on data by using RL to impose desired behaviors, while maintaining good predictive properties learned from data.", "abstract": "The approach of training sequence models using supervised learning and next-step prediction suffers from known failure modes. For example, it is notoriously difficult to ensure multi-step generated sequences have coherent global structure. We propose a novel sequence-learning approach in which we use a pre-trained Recurrent Neural Network (RNN) to supply part of the reward value in a Reinforcement Learning (RL) model. Thus, we can refine a sequence predictor by optimizing for some imposed reward functions, while maintaining good predictive properties learned from data. We propose efficient ways to solve this by augmenting deep Q-learning with a cross-entropy reward and deriving novel off-policy methods for RNNs from KL control. We explore the usefulness of our approach in the context of music generation. An LSTM is trained on a large corpus of songs to predict the next note in a musical sequence. This Note RNN is then refined using our method and rules of music theory. We show that by combining maximum likelihood (ML) and RL in this way, we can not only produce more pleasing melodies, but significantly reduce unwanted behaviors and failure modes of the RNN, while maintaining information learned from data.", "keywords": "Deep learning;Reinforcement Learning;Structured prediction;Supervised Learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Natasha Jaques;Shixiang Gu;Richard E. 
Turner;Douglas Eck", "authorids": "jaquesn@mit.edu;sg717@cam.ac.uk;ret26@cam.ack.uk;deck@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\njaques2017tuning,\ntitle={Tuning Recurrent Neural Networks with Reinforcement Learning},\nauthor={Natasha Jaques and Shixiang Gu and Richard E. Turner and Douglas Eck},\nyear={2017},\nurl={https://openreview.net/forum?id=BJ8fyHceg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4;AnonReviewer2", "site": "https://openreview.net/forum?id=BJ8fyHceg", "pdf_size": 0, "rating": "5;5;6", "confidence": "5;5;3", "rating_avg": 5.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 20, "authors#_avg": 4, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 90, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15794613327819738797&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "BJ9fZNqle", "title": "Multi-modal Variational Encoder-Decoders", "track": "main", "status": "Reject", "tldr": "Learning continuous multimodal latent variables in the variational auto-encoder framework for text processing applications.", "abstract": "Recent advances in neural variational inference have facilitated efficient training of powerful directed graphical models with continuous latent variables, such as variational autoencoders. However, these models usually assume simple, uni-modal priors \u2014 such as the multivariate Gaussian distribution \u2014 yet many real-world data distributions are highly complex and multi-modal. Examples of complex and multi-modal distributions range from topics in newswire text to conversational dialogue responses. When such latent variable models are applied to these domains, the restriction of the simple, uni-modal prior hinders the overall expressivity of the learned model as it cannot possibly capture more complex aspects of the data distribution. To overcome this critical restriction, we propose a flexible, simple prior distribution which can be learned efficiently and potentially capture an exponential number of modes of a target distribution. We develop the multi-modal variational encoder-decoder framework and investigate the effectiveness of the proposed prior in several natural language processing modeling tasks, including document modeling and dialogue modeling.", "keywords": "Deep learning;Structured prediction;Natural language processing", "primary_area": "", "supplementary_material": "", "author": "Iulian V. Serban;Alexander G. Ororbia II;Joelle Pineau;Aaron Courville", "authorids": "julianserban@gmail.com;ago109@psu.edu;jpineau@cs.mcgill.ca;aaron.courville@umontreal.ca", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nserban2017multimodal,\ntitle={Multi-modal Variational Encoder-Decoders},\nauthor={Iulian V. Serban and Alexander G. 
Ororbia II and Joelle Pineau and Aaron Courville},\nyear={2017},\nurl={https://openreview.net/forum?id=BJ9fZNqle}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=BJ9fZNqle", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;5;4", "rating_avg": 3.6666666666666665, "confidence_avg": 4.333333333333333, "replies_avg": 27, "authors#_avg": 4, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8310138015806522836&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "BJAA4wKxg", "title": "A Convolutional Encoder Model for Neural Machine Translation", "track": "main", "status": "Reject", "tldr": "Investigate encoder models for translation and demonstrate that convolutions can outperform LSTMs as encoders.", "abstract": "The prevalent approach to neural machine translation relies on bi-directional LSTMs to encode the source sentence.\nIn this paper we present a faster and simpler architecture based on a succession of convolutional layers. \nThis allows to encode the entire source sentence simultaneously compared to recurrent networks for which computation is constrained by temporal dependencies.\nOn WMT'16 English-Romanian translation we achieve competitive accuracy to the state-of-the-art and we outperform several recently published results on the WMT'15 English-German task. \nOur models obtain almost the same accuracy as a very deep LSTM setup on WMT'14 English-French translation.\nOur convolutional encoder speeds up CPU decoding by more than two times at the same or higher accuracy as a strong bi-directional LSTM baseline. ", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jonas Gehring;Michael Auli;David Grangier;Yann N. Dauphin", "authorids": "jgehring@fb.com;michaelauli@fb.com;grangier@fb.com;ynd@fb.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ngehring2017a,\ntitle={A Convolutional Encoder Model for Neural Machine Translation},\nauthor={Jonas Gehring and Michael Auli and David Grangier and Yann N. Dauphin},\nyear={2017},\nurl={https://openreview.net/forum?id=BJAA4wKxg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=BJAA4wKxg", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;5;3", "rating_avg": 6.333333333333333, "confidence_avg": 4.0, "replies_avg": 18, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 621, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13078160224216368728&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12 }, { "id": "BJAFbaolg", "title": "Learning to Generate Samples from Noise through Infusion Training", "track": "main", "status": "Poster", "tldr": "We learn a markov transition operator acting on inputspace, to denoise random noise into a target distribution. We use a novel target injection technique to guide the training.", "abstract": "In this work, we investigate a novel training procedure to learn a generative model as the transition operator of a Markov chain, such that, when applied repeatedly on an unstructured random noise sample, it will denoise it into a sample that matches the target distribution from the training set. 
The novel training procedure to learn this progressive denoising operation involves sampling from a slightly different chain than the model chain used for generation in the absence of a denoising target. In the training chain we infuse information from the training target example that we would like the chains to reach with a high probability. The thus learned transition operator is able to produce quality and varied samples in a small number of steps. Experiments show competitive results compared to the samples generated with a basic Generative Adversarial Net. ", "keywords": "Deep learning;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Florian Bordes;Sina Honari;Pascal Vincent", "authorids": "florian.bordes@umontreal.ca;sina.honari@umontreal.ca;pascal.vincent@umontreal.ca", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nbordes2017learning,\ntitle={Learning to Generate Samples from Noise through Infusion Training},\nauthor={Florian Bordes and Sina Honari and Pascal Vincent},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=BJAFbaolg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer1", "site": "https://openreview.net/forum?id=BJAFbaolg", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;5;5", "rating_avg": 7.0, "confidence_avg": 4.666666666666667, "replies_avg": 19, "authors#_avg": 3, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3990724804432375369&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "BJC8LF9ex", "title": "Recurrent Neural Networks for Multivariate Time Series with Missing Values", "track": "main", "status": "Reject", "tldr": "", "abstract": "Multivariate time series data in practical applications, such as health care, geoscience, and biology, are characterized by a variety of missing values. In time series prediction and other related tasks, it has been noted that missing values and their missing patterns are often correlated with the target labels, a.k.a., informative missingness. There is very limited work on exploiting the missing patterns for effective imputation and improving prediction performance. In this paper, we develop novel deep learning models, namely GRU-D, as one of the early attempts. GRU-D is based on Gated Recurrent Units (GRU), a state-of-the-art recurrent neural network. It takes two representations of missing patterns, i.e., masking and time interval, and effectively incorporates them into a deep model architecture so that it not only captures the long-term temporal dependencies in time series, but also utilizes the missing patterns to achieve better prediction results. 
Experiments of time series classification tasks on real-world clinical datasets (MIMIC-III, PhysioNet) and synthetic datasets demonstrate that our models achieve state-of-the-art performance and provides useful insights for better understanding and utilization of missing values in time series analysis.", "keywords": "Deep learning", "primary_area": "", "supplementary_material": "", "author": "Zhengping Che;Sanjay Purushotham;Kyunghyun Cho;David Sontag;Yan Liu", "authorids": "zche@usc.edu;spurusho@usc.edu;kyunghyun.cho@nyu.edu;dsontag@cs.nyu.edu;yanliu.cs@usc.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nche2017recurrent,\ntitle={Recurrent Neural Networks for Multivariate Time Series with Missing Values},\nauthor={Zhengping Che and Sanjay Purushotham and Kyunghyun Cho and David Sontag and Yan Liu},\nyear={2017},\nurl={https://openreview.net/forum?id=BJC8LF9ex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer2;AnonReviewer5", "site": "https://openreview.net/forum?id=BJC8LF9ex", "pdf_size": 0, "rating": "5;6;6", "confidence": "3;3;4", "rating_avg": 5.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 15, "authors#_avg": 5, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 2644, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16779387427970895511&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 15 }, { "id": "BJC_jUqxe", "title": "A STRUCTURED SELF-ATTENTIVE SENTENCE EMBEDDING", "track": "main", "status": "Poster", "tldr": "a new model for extracting an interpretable sentence embedding by introducing self-attention and matrix representation.", "abstract": "This paper proposes a new model for extracting an interpretable sentence embedding by introducing self-attention. Instead of using a vector, we use a 2-D matrix to represent the embedding, with each row of the matrix attending on a different part of the sentence. We also propose a self-attention mechanism and a special regularization term for the model. As a side effect, the embedding comes with an easy way of visualizing what specific parts of the sentence are encoded into the embedding. We evaluate our model on 3 different tasks: author profiling, sentiment classification and textual entailment. 
Results show that our model yields a significant performance gain compared to other sentence embedding methods in all of the 3 tasks.", "keywords": "Natural language processing;Deep learning;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Zhouhan Lin;Minwei Feng;Cicero Nogueira dos Santos;Mo Yu;Bing Xiang;Bowen Zhou;Yoshua Bengio", "authorids": "lin.zhouhan@gmail.com;mfeng@us.ibm.com;cicerons@us.ibm.com;yum@us.ibm.com;bingxia@us.ibm.com;zhou@us.ibm.com;yoshua.bengio@umontreal.ca", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nlin2017a,\ntitle={A {STRUCTURED} {SELF}-{ATTENTIVE} {SENTENCE} {EMBEDDING}},\nauthor={Zhouhan Lin and Minwei Feng and Cicero Nogueira dos Santos and Mo Yu and Bing Xiang and Bowen Zhou and Yoshua Bengio},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=BJC_jUqxe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BJC_jUqxe", "pdf_size": 0, "rating": "5;6;8", "confidence": "4;5;4", "rating_avg": 6.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 15, "authors#_avg": 7, "corr_rating_confidence": -0.18898223650461357, "gs_citation": 2949, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3666844900655302515&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "BJFG8Yqxl", "title": "Group Sparse CNNs for Question Sentence Classification with Answer Sets", "track": "main", "status": "Reject", "tldr": "", "abstract": "Classifying question sentences into their corresponding categories is an important task with wide applications, for example in many websites' FAQ sections. \nHowever, traditional question classification techniques do not fully utilize the well-prepared answer data which has great potential for improving question sentence representations which could lead to better classification performance. In order to encode answer information into question representation, we first introduce novel group sparse autoencoders which could utilize the group information in the answer set to refine question representation. We then propose a new group sparse convolutional neural network which could naturally learn the question representation with respect to their corresponding answers by implanting the group sparse autoencoders into the traditional convolutional neural network. The proposed model show significant improvements over strong baselines on four datasets. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mingbo Ma;Liang Huang;Bing Xiang;Bowen Zhou", "authorids": "mam@oregonstate.edu;liang.huang@oregonstate.edu;bingxia@us.ibm.com;zhou@us.ibm.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nma2017group,\ntitle={Group Sparse {CNN}s for Question Sentence Classification with Answer Sets},\nauthor={Mingbo Ma and Liang Huang and Bing Xiang and Bowen Zhou},\nyear={2017},\nurl={https://openreview.net/forum?id=BJFG8Yqxl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=BJFG8Yqxl", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;4", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 6, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:eaNKcawIBbUJ:scholar.google.com/&scioq=Group+Sparse+CNNs+for+Question+Sentence+Classification+with+Answer+Sets&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "BJK3Xasel", "title": "Nonparametric Neural Networks", "track": "main", "status": "Poster", "tldr": "We automatically set the size of an MLP by adding and removing units during training as appropriate.", "abstract": "Automatically determining the optimal size of a neural network for a given task without prior information currently requires an expensive global search and training many networks from scratch. In this paper, we address the problem of automatically finding a good network size during a single training cycle. We introduce {\\it nonparametric neural networks}, a non-probabilistic framework for conducting optimization over all possible network sizes and prove its soundness when network growth is limited via an $\\ell_p$ penalty. We train networks under this framework by continuously adding new units while eliminating redundant units via an $\\ell_2$ penalty. We employ a novel optimization algorithm, which we term ``Adaptive Radial-Angular Gradient Descent'' or {\\it AdaRad}, and obtain promising results.", "keywords": "Deep learning;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "George Philipp;Jaime G. Carbonell", "authorids": "george.philipp@email.de;jgc@cs.cmu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nphilipp2017nonparametric,\ntitle={Nonparametric Neural Networks},\nauthor={George Philipp and Jaime G. 
Carbonell},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=BJK3Xasel}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BJK3Xasel", "pdf_size": 0, "rating": "5;7;7", "confidence": "4;3;4", "rating_avg": 6.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 18, "authors#_avg": 2, "corr_rating_confidence": -0.49999999999999983, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13687549300042375566&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "BJKYvt5lg", "title": "PixelVAE: A Latent Variable Model for Natural Images", "track": "main", "status": "Poster", "tldr": "VAE with an autoregressive PixelCNN-based decoder with strong performance on binarized MNIST, ImageNet 64x64, and LSUN bedrooms.", "abstract": "Natural image modeling is a landmark challenge of unsupervised learning. Variational Autoencoders (VAEs) learn a useful latent representation and model global structure well but have difficulty capturing small details. PixelCNN models details very well, but lacks a latent code and is difficult to scale for capturing large structures. We present PixelVAE, a VAE model with an autoregressive decoder based on PixelCNN. Our model requires very few expensive autoregressive layers compared to PixelCNN and learns latent codes that are more compressed than a standard VAE while still capturing most non-trivial structure. Finally, we extend our model to a hierarchy of latent variables at different scales. Our model achieves state-of-the-art performance on binarized MNIST, competitive performance on 64 \u00d7 64 ImageNet, and high-quality samples on the LSUN bedrooms dataset.\n", "keywords": "Deep learning;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Ishaan Gulrajani;Kundan Kumar;Faruk Ahmed;Adrien Ali Taiga;Francesco Visin;David Vazquez;Aaron Courville", "authorids": "igul222@gmail.com;kundankumar2510@gmail.com;faruk.ahmed.91@gmail.com;adrien.alitaiga@gmail.com;francesco.visin@polimi.it;dvazquez@cvc.uab.es;aaron.courville@gmail.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\ngulrajani2017pixelvae,\ntitle={Pixel{VAE}: A Latent Variable Model for Natural Images},\nauthor={Ishaan Gulrajani and Kundan Kumar and Faruk Ahmed and Adrien Ali Taiga and Francesco Visin and David Vazquez and Aaron Courville},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=BJKYvt5lg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer5;AnonReviewer4;AnonReviewer1", "site": "https://openreview.net/forum?id=BJKYvt5lg", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;3;4", "rating_avg": 6.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 12, "authors#_avg": 7, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 420, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4969065840828680485&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "BJO-BuT1g", "title": "A Learned Representation For Artistic Style", "track": "main", "status": "Poster", "tldr": "A deep neural network to learn and combine artistic styles.", "abstract": "The diversity of 
painting styles represents a rich visual vocabulary for the construction of an image. The degree to which one may learn and parsimoniously capture this visual vocabulary measures our understanding of the higher level features of paintings, if not images in general. In this work we investigate the construction of a single, scalable deep network that can parsimoniously capture the artistic style of a diversity of paintings. We demonstrate that such a network generalizes across a diversity of artistic styles by reducing a painting to a point in an embedding space. Importantly, this model permits a user to explore new painting styles by arbitrarily combining the styles learned from individual paintings. We hope that this work provides a useful step towards building rich models of paintings and offers a window on to the structure of the learned representation of artistic style.", "keywords": "Computer vision;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Vincent Dumoulin;Jonathon Shlens;Manjunath Kudlur", "authorids": "vi.dumoulin@gmail.com;shlens@google.com;keveman@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ndumoulin2017a,\ntitle={A Learned Representation For Artistic Style},\nauthor={Vincent Dumoulin and Jonathon Shlens and Manjunath Kudlur},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=BJO-BuT1g}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BJO-BuT1g", "pdf_size": 0, "rating": "7;8;8", "confidence": "3;5;5", "rating_avg": 7.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 22, "authors#_avg": 3, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 1408, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7122040962029266183&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "BJRIA3Fgg", "title": "Modularized Morphing of Neural Networks", "track": "main", "status": "Workshop", "tldr": "", "abstract": "In this work we study the problem of network morphism, an effective learning scheme to morph a well-trained neural network to a new one with the network function completely preserved. Different from existing work where basic morphing types on the layer level were addressed, we target at the central problem of network morphism at a higher level, i.e., how a convolutional layer can be morphed into an arbitrary module of a neural network. To simplify the representation of a network, we abstract a module as a graph with blobs as vertices and convolutional layers as edges, based on which the morphing process is able to be formulated as a graph transformation problem. Two atomic morphing operations are introduced to compose the graphs, based on which modules are classified into two families, i.e., simple morphable modules and complex modules. We present practical morphing solutions for both of these two families, and prove that any reasonable module can be morphed from a single convolutional layer. 
Extensive experiments have been conducted based on the state-of-the-art ResNet on benchmark datasets, and the effectiveness of the proposed solution has been verified.", "keywords": "Deep learning;Computer vision", "primary_area": "", "supplementary_material": "", "author": "Tao Wei;Changhu Wang;Chang Wen Chen", "authorids": "taowei@buffalo.edu;chw@microsoft.com;chencw@buffalo.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nwei2017modularized,\ntitle={Modularized Morphing of Neural Networks},\nauthor={Tao Wei and Changhu Wang and Chang Wen Chen},\nyear={2017},\nurl={https://openreview.net/forum?id=BJRIA3Fgg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=BJRIA3Fgg", "pdf_size": 0, "rating": "5;6;7;7", "confidence": "4;5;4;5", "rating_avg": 6.25, "confidence_avg": 4.5, "replies_avg": 15, "authors#_avg": 3, "corr_rating_confidence": 0.30151134457776363, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12625898445651317771&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "BJVEEF9lx", "title": "Learning Approximate Distribution-Sensitive Data Structures", "track": "main", "status": "Reject", "tldr": "We model mental representations as abstract distribution-sensitive data types and synthesize concrete implementations using deep networks from specification", "abstract": "We present a computational model of mental representations as data-structures which are distribution sensitive, i.e., which exploit non-uniformity in their usage patterns to reduce time or space complexity.\nAbstract data types equipped with axiomatic specifications specify classes of concrete data structures with equivalent logical behavior.\nWe extend this formalism to distribution-sensitive data structures with the concept of a probabilistic axiomatic specification, which is implemented by a concrete data structure only with some probability.\nWe employ a number of approximations to synthesize several distribution-sensitive data structures from probabilistic specification as deep neural networks, such as a stack, queue, natural number, set, and binary tree.", "keywords": "Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Zenna Tavares;Armando Solar-Lezama", "authorids": "zenna@mit.edu;asolar@csail.mit.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ntavares2017learning,\ntitle={Learning Approximate Distribution-Sensitive Data Structures},\nauthor={Zenna Tavares and Armando Solar-Lezama},\nyear={2017},\nurl={https://openreview.net/forum?id=BJVEEF9lx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=BJVEEF9lx", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;3;3", "rating_avg": 3.6666666666666665, "confidence_avg": 3.3333333333333335, "replies_avg": 7, "authors#_avg": 2, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:RD6nRAmc5ScJ:scholar.google.com/&scioq=Learning+Approximate+Distribution-Sensitive+Data+Structures&hl=en&as_sdt=0,33", "gs_version_total": 2 }, { "id": 
"BJYwwY9ll", "title": "Snapshot Ensembles: Train 1, Get M for Free", "track": "main", "status": "Poster", "tldr": "", "abstract": "Ensembles of neural networks are known to be much more robust and accurate than individual networks. However, training multiple deep networks for model averaging is computationally expensive. In this paper, we propose a method to obtain the seemingly contradictory goal of ensembling multiple neural networks at no additional training cost. We achieve this goal by training a single neural network, converging to several local minima along its optimization path and saving the model parameters. To obtain repeated rapid convergence, we leverage recent work on cyclic learning rate schedules. The resulting technique, which we refer to as Snapshot Ensembling, is simple, yet surprisingly effective. We show in a series of experiments that our approach is compatible with diverse network architectures and learning tasks. It consistently yields lower error rates than state-of-the-art single models at no additional training cost, and compares favorably with traditional network ensembles. On CIFAR-10 and CIFAR-100 our DenseNet Snapshot Ensembles obtain error rates of 3.4% and 17.4% respectively.", "keywords": "Deep learning;Computer vision", "primary_area": "", "supplementary_material": "", "author": "Gao Huang;Yixuan Li;Geoff Pleiss;Zhuang Liu;John E. Hopcroft;Kilian Q. Weinberger", "authorids": "gh349@cornell.edu;yl2363@cornell.edu;geoff@cs.cornell.edu;liuzhuangthu@gmail.com;jeh@cs.cornell.edu;kqw4@cornell.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nhuang2017snapshot,\ntitle={Snapshot Ensembles: Train 1, Get M for Free},\nauthor={Gao Huang and Yixuan Li and Geoff Pleiss and Zhuang Liu and John E. Hopcroft and Kilian Q. Weinberger},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=BJYwwY9ll}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BJYwwY9ll", "pdf_size": 0, "rating": "7;8;9", "confidence": "5;3;4", "rating_avg": 8.0, "confidence_avg": 4.0, "replies_avg": 36, "authors#_avg": 6, "corr_rating_confidence": -0.5, "gs_citation": 1227, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13258787322136448860&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "BJ_MGwqlg", "title": "Rethinking Numerical Representations for Deep Neural Networks", "track": "main", "status": "Reject", "tldr": "We find that the optimal numerical representation for large-scale DNNs is very different than the small-scale ones that are used in current DNN hardware research.", "abstract": "With ever-increasing computational demand for deep learning, it is critical to investigate the implications of the numeric representation and precision of DNN model weights and activations on computational efficiency. In this work, we explore unconventional narrow-precision floating-point representations as it relates to inference accuracy and efficiency to steer the improved design of future DNN platforms. 
We show that inference using these custom numeric representations on production-grade DNNs, including GoogLeNet and VGG, achieves an average speedup of 7.6x with less than 1% degradation in inference accuracy relative to a state-of-the-art baseline platform representing the most sophisticated hardware using single-precision floating point. To facilitate the use of such customized precision, we also present a novel technique that drastically reduces the time required to derive the optimal precision configuration.", "keywords": "Deep learning", "primary_area": "", "supplementary_material": "", "author": "Parker Hill;Babak Zamirai;Shengshuo Lu;Yu-Wei Chao;Michael Laurenzano;Mehrzad Samadi;Marios Papaefthymiou;Scott Mahlke;Thomas Wenisch;Jia Deng;Lingjia Tang;Jason Mars", "authorids": "parkerhh@umich.edu;zamirai@umich.edu;luss@umich.edu;ywchao@umich.edu;mlaurenz@umich.edu;mehrzads@umich.edu;marios@umich.edu;mahlke@umich.edu;twenisch@umich.edu;jiadeng@umich.edu;lingjia@umich.edu;profmars@umich.edu", "gender": ";;;;;;;;;;;", "homepage": ";;;;;;;;;;;", "dblp": ";;;;;;;;;;;", "google_scholar": ";;;;;;;;;;;", "orcid": ";;;;;;;;;;;", "linkedin": ";;;;;;;;;;;", "or_profile": ";;;;;;;;;;;", "aff": ";;;;;;;;;;;", "aff_domain": ";;;;;;;;;;;", "position": ";;;;;;;;;;;", "bibtex": "@misc{\nhill2017rethinking,\ntitle={Rethinking Numerical Representations for Deep Neural Networks},\nauthor={Parker Hill and Babak Zamirai and Shengshuo Lu and Yu-Wei Chao and Michael Laurenzano and Mehrzad Samadi and Marios Papaefthymiou and Scott Mahlke and Thomas Wenisch and Jia Deng and Lingjia Tang and Jason Mars},\nyear={2017},\nurl={https://openreview.net/forum?id=BJ_MGwqlg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer5;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BJ_MGwqlg", "pdf_size": 0, "rating": "5;5;6", "confidence": "5;2;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 11, "authors#_avg": 12, "corr_rating_confidence": -0.1889822365046136, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11470471489865850829&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "BJa0ECFxe", "title": "Information Dropout: learning optimal representations through noise", "track": "main", "status": "Reject", "tldr": "We introduce Information Dropout, an information theoretic generalization of dropout that highlights how injecting noise can help in learning invariant representations.", "abstract": "We introduce Information Dropout, a generalization of dropout that is motivated by the Information Bottleneck principle and highlights the way in which injecting noise in the activations can help in learning optimal representations of the data. Information Dropout is rooted in information theoretic principles, it includes as special cases several existing dropout methods, like Gaussian Dropout and Variational Dropout, and, unlike classical dropout, it can learn and build representations that are invariant to nuisances of the data, like occlusions and clutter. When the task is the reconstruction of the input, we show that the information dropout method yields a variational autoencoder as a special case, thus providing a link between representation learning, information theory and variational inference. 
Our experiments validate the theoretical intuitions behind our method, and we find that information dropout achieves a comparable or better generalization performance than binary dropout, especially on smaller models, since it can automatically adapt the noise to the structure of the network, as well as to the test sample.", "keywords": "Theory;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Alessandro Achille;Stefano Soatto", "authorids": "achille@cs.ucla.edu;soatto@cs.ucla.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nachille2017information,\ntitle={Information Dropout: learning optimal representations through noise},\nauthor={Alessandro Achille and Stefano Soatto},\nyear={2017},\nurl={https://openreview.net/forum?id=BJa0ECFxe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BJa0ECFxe", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9275729676917115480&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "BJbD_Pqlg", "title": "Human perception in computer vision", "track": "main", "status": "Reject", "tldr": "Correlates for several properties of human perception emerge in convolutional neural networks following image categorization learning.", "abstract": "Computer vision has made remarkable progress in recent years. Deep neural network (DNN) models optimized to identify objects in images exhibit unprecedented task-trained accuracy and, remarkably, some generalization ability: new visual problems can now be solved more easily based on previous learning. Biological vision (learned in life and through evolution) is also accurate and general-purpose. Is it possible that these different learning regimes converge to similar problem-dependent optimal computations? We therefore asked whether the human system-level computation of visual perception has DNN correlates and considered several anecdotal test cases. We found that perceptual sensitivity to image changes has DNN mid-computation correlates, while sensitivity to segmentation, crowding and shape has DNN end-computation correlates. 
Our results quantify the applicability of using DNN computation to estimate perceptual loss, and are consistent with the fascinating theoretical view that properties of human perception are a consequence of architecture-independent visual learning.", "keywords": "Computer vision;Transfer Learning", "primary_area": "", "supplementary_material": "", "author": "Ron Dekel", "authorids": "ron.dekel@weizmann.ac.il", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\ndekel2017human,\ntitle={Human perception in computer vision},\nauthor={Ron Dekel},\nyear={2017},\nurl={https://openreview.net/forum?id=BJbD_Pqlg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BJbD_Pqlg", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;4;3", "rating_avg": 6.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 10, "authors#_avg": 1, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16036336513094776149&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "BJh6Ztuxl", "title": "Fine-grained Analysis of Sentence Embeddings Using Auxiliary Prediction Tasks", "track": "main", "status": "Poster", "tldr": "A method for analyzing sentence embeddings on a fine-grained level using auxiliary prediction tasks", "abstract": "There is a lot of research interest in encoding variable length sentences into fixed\nlength vectors, in a way that preserves the sentence meanings. Two common\nmethods include representations based on averaging word vectors, and representations based on the hidden states of recurrent neural networks such as LSTMs.\nThe sentence vectors are used as features for subsequent machine learning tasks\nor for pre-training in the context of deep learning. However, not much is known\nabout the properties that are encoded in these sentence representations and about\nthe language information they capture.\nWe propose a framework that facilitates better understanding of the encoded representations. We define prediction tasks around isolated aspects of sentence structure (namely sentence length, word content, and word order), and score representations by the ability to train a classifier to solve each prediction task when\nusing the representation as input. We demonstrate the potential contribution of the\napproach by analyzing different sentence representation mechanisms. 
The analysis sheds light on the relative strengths of different sentence embedding methods with respect to these low level prediction tasks, and on the effect of the encoded\nvector\u2019s dimensionality on the resulting representations.", "keywords": "Natural language processing;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Yossi Adi;Einat Kermany;Yonatan Belinkov;Ofer Lavi;Yoav Goldberg", "authorids": "yossiadidrum@gmail.com;einatke@il.ibm.com;belinkov@mit.edu;oferl@il.ibm.com;yoav.goldberg@gmail.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nadi2017finegrained,\ntitle={Fine-grained Analysis of Sentence Embeddings Using Auxiliary Prediction Tasks},\nauthor={Yossi Adi and Einat Kermany and Yonatan Belinkov and Ofer Lavi and Yoav Goldberg},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=BJh6Ztuxl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BJh6Ztuxl", "pdf_size": 0, "rating": "8;8;8", "confidence": "4;5;4", "rating_avg": 8.0, "confidence_avg": 4.333333333333333, "replies_avg": 15, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 664, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6309693306335821652&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 14 }, { "id": "BJhZeLsxx", "title": "What does it take to generate natural textures?", "track": "main", "status": "Poster", "tldr": "Natural textures of high perceptual quality can be generated from networks with only a single layer, no pooling and random filters.", "abstract": "Natural image generation is currently one of the most actively explored fields in Deep Learning. Many approaches, e.g. for state-of-the-art artistic style transfer or natural texture synthesis, rely on the statistics of hierarchical representations in supervisedly trained deep neural networks. It is, however, unclear what aspects of this feature representation are crucial for natural image generation: is it the depth, the pooling or the training of the features on natural images? We here address this question for the task of natural texture synthesis and show that none of the above aspects are indispensable. 
Instead, we demonstrate that natural textures of high perceptual quality can be generated from networks with only a single layer, no pooling and random filters.", "keywords": "Deep learning;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Ivan Ustyuzhaninov *;Wieland Brendel *;Leon Gatys;Matthias Bethge", "authorids": "ivan.ustyuzhaninov@bethgelab.org;wieland.brendel@bethgelab.org;leon.gatys@bethgelab.org;matthias.bethge@bethgelab.org", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\n*2017what,\ntitle={What does it take to generate natural textures?},\nauthor={Ivan Ustyuzhaninov * and Wieland Brendel * and Leon Gatys and Matthias Bethge},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=BJhZeLsxx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BJhZeLsxx", "pdf_size": 0, "rating": "7;8;8", "confidence": "4;3;5", "rating_avg": 7.666666666666667, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15531049356447996546&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "BJjn-Yixl", "title": "Attentive Recurrent Comparators", "track": "main", "status": "Reject", "tldr": "Attention and Recurrence can be as good as Convolution in some cases. Bigger returns when we combine all three.", "abstract": "Attentive Recurrent Comparators (ARCs) are a novel class of neural networks built with attention and recurrence that learn to estimate the similarity of a set of objects by cycling through them and making observations. The observations made in one object are conditioned on the observations made in all the other objects. This allows ARCs to learn to focus on the salient aspects needed to ascertain similarity. Our simplistic model that does not use any convolutions performs comparably to Deep Convolutional Siamese Networks on various visual tasks. However using ARCs and convolutional feature extractors in conjunction produces a model that is significantly better than any other method and has superior generalization capabilities. On the Omniglot dataset, ARC based models achieve an error rate of 1.5\\% in the One-Shot classification task - a 2-3x reduction compared to the previous best models. 
This is also the first Deep Learning model to outperform humans (4.5\\%) and surpass the state of the art accuracy set by the highly specialized Hierarchical Bayesian Program Learning (HBPL) system (3.3\\%).", "keywords": "Deep learning;Computer vision", "primary_area": "", "supplementary_material": "", "author": "Pranav Shyam;Ambedkar Dukkipati", "authorids": "pranavm.cs13@rvce.edu.in;ad@csa.iisc.ernet.in", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nshyam2017attentive,\ntitle={Attentive Recurrent Comparators},\nauthor={Pranav Shyam and Ambedkar Dukkipati},\nyear={2017},\nurl={https://openreview.net/forum?id=BJjn-Yixl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=BJjn-Yixl", "pdf_size": 0, "rating": "3;4;5", "confidence": "5;5;2", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 2, "corr_rating_confidence": -0.8660254037844387, "gs_citation": 161, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16094580685287102974&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "BJluGHcee", "title": "Tensorial Mixture Models", "track": "main", "status": "Reject", "tldr": "A generative model realized through convolutional networks, which has the unique property of having both tractable inference and marginalization, showing state-of-the-art results on classification with missing data.", "abstract": "We introduce a generative model, we call Tensorial Mixture Models (TMMs) based on mixtures of basic\ncomponent distributions over local structures (e.g. patches in an image) where the dependencies between\nthe local-structures are represented by a \"priors tensor\" holding the prior probabilities of assigning a\ncomponent distribution to each local-structure.\n\nIn their general form, TMMs are intractable as the priors tensor is typically of exponential size. However,\nwhen the priors tensor is decomposed it gives rise to an arithmetic circuit which in turn transforms the\nTMM into a Convolutional Arithmetic Circuit (ConvAC). A ConvAC corresponds to a shallow (single hidden layer)\nnetwork when the priors tensor is decomposed by a CP (sum of rank-1) approach and corresponds to a\ndeep network when the decomposition follows the Hierarchical Tucker (HT) model.\n\nThe ConvAC representation of a TMM possesses several attractive properties. First, the inference is tractable\nand is implemented by a forward pass through a deep network. Second, the architectural design of the model\nfollows the deep networks community design, i.e., the structure of TMMs is determined by just two easily\nunderstood factors: size of pooling windows and number of channels. 
Finally, we demonstrate the effectiveness\nof our model when tackling the problem of classification with missing data, leveraging TMMs unique ability of\ntractable marginalization which leads to optimal classifiers regardless of the missingness distribution.", "keywords": "Deep learning;Supervised Learning;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Or Sharir;Ronen Tamari;Nadav Cohen;Amnon Shashua", "authorids": "or.sharir@cs.huji.ac.il;ronent@cs.huji.ac.il;cohennadav@cs.huji.ac.il;shashua@cs.huji.ac.il", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nsharir2017tensorial,\ntitle={Tensorial Mixture Models},\nauthor={Or Sharir and Ronen Tamari and Nadav Cohen and Amnon Shashua},\nyear={2017},\nurl={https://openreview.net/forum?id=BJluGHcee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer1", "site": "https://openreview.net/forum?id=BJluGHcee", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;3;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 13, "authors#_avg": 4, "corr_rating_confidence": -0.7559289460184545, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4763897973445892564&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "BJlxmAKlg", "title": "ReasoNet: Learning to Stop Reading in Machine Comprehension", "track": "main", "status": "Reject", "tldr": "ReasoNet Reader for machine reading and comprehension", "abstract": "Teaching a computer to read a document and answer general questions pertaining to the document is a challenging yet unsolved problem. In this paper, we describe a novel neural network architecture called Reasoning Network ({ReasoNet}) for machine comprehension tasks. ReasoNet makes use of multiple turns to effectively exploit and then reason over the relation among queries, documents, and answers. Different from previous approaches using a fixed number of turns during inference, ReasoNet introduces a termination state to relax this constraint on the reasoning depth. With the use of reinforcement learning, ReasoNet can dynamically determine whether to continue the comprehension process after digesting intermediate results, or to terminate reading when it concludes that existing information is adequate to produce an answer. ReasoNet has achieved state-of-the-art performance in machine comprehension datasets, including unstructured CNN and Daily Mail datasets, and a structured Graph Reachability dataset. 
\n", "keywords": "Deep learning;Natural language processing", "primary_area": "", "supplementary_material": "", "author": "Yelong Shen;Po-Sen Huang;Jianfeng Gao;Weizhu Chen", "authorids": "yeshen@microsoft.com;pshuang@microsoft.com;jfgao@microsoft.com;wzchen@microsoft.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nshen2017reasonet,\ntitle={ReasoNet: Learning to Stop Reading in Machine Comprehension},\nauthor={Yelong Shen and Po-Sen Huang and Jianfeng Gao and Weizhu Chen},\nyear={2017},\nurl={https://openreview.net/forum?id=BJlxmAKlg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BJlxmAKlg", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;3;5", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 13, "authors#_avg": 4, "corr_rating_confidence": 1.0, "gs_citation": 341, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11064907095972429113&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9 }, { "id": "BJm4T4Kgx", "title": "Adversarial Machine Learning at Scale", "track": "main", "status": "Poster", "tldr": "", "abstract": "Adversarial examples are malicious inputs designed to fool machine learning models.\nThey often transfer from one model to another, allowing attackers to mount black\nbox attacks without knowledge of the target model's parameters.\nAdversarial training is the process of explicitly training a model on adversarial\nexamples, in order to make it more robust to attack or to reduce its test error\non clean inputs.\nSo far, adversarial training has primarily been applied to small problems.\nIn this research, we apply adversarial training to ImageNet.\nOur contributions include:\n(1) recommendations for how to succesfully scale adversarial training to large models and datasets,\n(2) the observation that adversarial training confers robustness to single-step attack methods,\n(3) the finding that multi-step attack methods are somewhat less transferable than single-step attack\n methods, so single-step attacks are the best for mounting black-box attacks,\n and\n(4) resolution of a ``label leaking'' effect that causes adversarially trained models to perform\n better on adversarial examples than on clean examples, because the adversarial\n example construction process uses the true label and the model can learn to\n exploit regularities in the construction process.\n", "keywords": "Computer vision;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Alexey Kurakin;Ian J. Goodfellow;Samy Bengio", "authorids": "kurakin@google.com;ian@openai.com;bengio@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nkurakin2017adversarial,\ntitle={Adversarial Machine Learning at Scale},\nauthor={Alexey Kurakin and Ian J. 
Goodfellow and Samy Bengio},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=BJm4T4Kgx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BJm4T4Kgx", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;3", "rating_avg": 6.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 13, "authors#_avg": 3, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 4015, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8221212997031548134&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 15 }, { "id": "BJmCKBqgl", "title": "DyVEDeep: Dynamic Variable Effort Deep Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep Neural Networks (DNNs) have advanced the state-of-the-art on a variety of machine learning tasks and are deployed widely in many real-world products. However, the compute and data requirements demanded by large-scale DNNs remains a significant challenge. In this work, we address this challenge in the context of DNN inference. We propose Dynamic Variable Effort Deep Neural Networks (DyVEDeep), which exploit the heterogeneity in the characteristics of inputs to DNNs to improve their compute efficiency while maintaining the same classification accuracy. DyVEDeep equips DNNs with dynamic effort knobs, which in course of processing an input, identify how critical a group of computations are to classify the input. DyVEDeep dynamically focuses its compute effort only on the critical computations, while the skipping/approximating the rest. We propose 3 effort knobs that operate at different levels of granularity viz. neuron, feature and layer levels. We build DyVEDeep versions for 5 popular image recognition benchmarks on 3 image datasets---MNIST, CIFAR and ImageNet. 
Across all benchmarks, DyVEDeep achieves 2.1X-2.6X reduction in number of scalar operations, which translates to 1.9X-2.3X performance improvement over a Caffe-based sequential software implementation, for negligible loss in accuracy.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sanjay Ganapathy;Swagath Venkataramani;Balaraman Ravindran;Anand Raghunathan", "authorids": "sanjaygana@gmail.com;venkata0@purdue.edu;ravi@cse.iitm.ac.in;raghunathan@purdue.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nganapathy2017dyvedeep,\ntitle={Dy{VED}eep: Dynamic Variable Effort Deep Neural Networks},\nauthor={Sanjay Ganapathy and Swagath Venkataramani and Balaraman Ravindran and Anand Raghunathan},\nyear={2017},\nurl={https://openreview.net/forum?id=BJmCKBqgl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BJmCKBqgl", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;3;3", "rating_avg": 6.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 10, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14023226423879584615&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "BJrFC6ceg", "title": "PixelCNN++: Improving the PixelCNN with Discretized Logistic Mixture Likelihood and Other Modifications", "track": "main", "status": "Poster", "tldr": "Adding discretized logistic mixture Likelihood and other modifications to PixelCNN improves performance.", "abstract": "PixelCNNs are a recently proposed class of powerful generative models with tractable likelihood. Here we discuss our implementation of PixelCNNs which we make available at https://github.com/openai/pixel-cnn. Our implementation contains a number of modifications to the original model that both simplify its structure and improve its performance. 1) We use a discretized logistic mixture likelihood on the pixels, rather than a 256-way softmax, which we find to speed up training. 2) We condition on whole pixels, rather than R/G/B sub-pixels, simplifying the model structure. 3) We use downsampling to efficiently capture structure at multiple resolutions. 4) We introduce additional short-cut connections to further speed up optimization. 5) We regularize the model using dropout. Finally, we present state-of-the-art log likelihood results on CIFAR-10 to demonstrate the usefulness of these modifications.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tim Salimans;Andrej Karpathy;Xi Chen;Diederik P. Kingma", "authorids": "tim@openai.com;karpathy@openai.com;peter@openai.com;dpkingma@openai.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nsalimans2017pixelcnn,\ntitle={Pixel{CNN}++: Improving the Pixel{CNN} with Discretized Logistic Mixture Likelihood and Other Modifications},\nauthor={Tim Salimans and Andrej Karpathy and Xi Chen and Diederik P. 
Kingma},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=BJrFC6ceg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BJrFC6ceg", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;5;4", "rating_avg": 6.666666666666667, "confidence_avg": 4.0, "replies_avg": 19, "authors#_avg": 4, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 1259, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3764972270987352239&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4 }, { "id": "BJtNZAFgg", "title": "Adversarial Feature Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "The ability of the Generative Adversarial Networks (GANs) framework to learn generative models mapping from simple latent distributions to arbitrarily complex data distributions has been demonstrated empirically, with compelling results showing generators learn to \"linearize semantics\" in the latent space of such models. Intuitively, such latent spaces may serve as useful feature representations for auxiliary problems where semantics are relevant. However, in their existing form, GANs have no means of learning the inverse mapping -- projecting data back into the latent space. We propose Bidirectional Generative Adversarial Networks (BiGANs) as a means of learning this inverse mapping, and demonstrate that the resulting learned feature representation is useful for auxiliary supervised discrimination tasks, competitive with contemporary approaches to unsupervised and self-supervised feature learning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jeff Donahue;Philipp Kr\u00e4henb\u00fchl;Trevor Darrell", "authorids": "jdonahue@cs.berkeley.edu;philkr@utexas.edu;trevor@eecs.berkeley.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ndonahue2017adversarial,\ntitle={Adversarial Feature Learning},\nauthor={Jeff Donahue and Philipp Kr{\\\"a}henb{\\\"u}hl and Trevor Darrell},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=BJtNZAFgg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BJtNZAFgg", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;3;3", "rating_avg": 7.0, "confidence_avg": 3.3333333333333335, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 2673, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10661655492543733137&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "BJuysoFeg", "title": "Revisiting Batch Normalization For Practical Domain Adaptation", "track": "main", "status": "Reject", "tldr": "We propose a simple yet effective approach for domain adaptation on batch normalized neural networks.", "abstract": "Deep neural networks (DNN) have shown unprecedented success in various computer vision applications such as image classification and object detection. However, it is still a common annoyance during the training phase, that one has to prepare at least thousands of labeled images to fine-tune a network to a specific domain. 
Recent study shows that a DNN has strong dependency towards the training dataset, and the learned features cannot be easily transferred to a different but relevant task without fine-tuning. In this paper, we propose a simple yet powerful remedy, called Adaptive Batch Normalization (AdaBN) to increase the generalization ability of a DNN. By modulating the statistics from the source domain to the target domain in all Batch Normalization layers across the network, our approach achieves deep adaptation effect for domain adaptation tasks. In contrary to other deep learning domain adaptation methods, our method does not require additional components, and is parameter-free. It archives state-of-the-art performance despite its surprising simplicity. Furthermore, we demonstrate that our method is complementary with other existing methods. Combining AdaBN with existing domain adaptation treatments may further improve model performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yanghao Li;Naiyan Wang;Jianping Shi;Jiaying Liu;Xiaodi Hou", "authorids": "lyttonhao@pku.edu.cn;winsty@gmail.com;shijianping5000@gmail.com;liujiaying@pku.edu.cn;xiaodi.hou@gmail.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nli2017revisiting,\ntitle={Revisiting Batch Normalization For Practical Domain Adaptation},\nauthor={Yanghao Li and Naiyan Wang and Jianping Shi and Jiaying Liu and Xiaodi Hou},\nyear={2017},\nurl={https://openreview.net/forum?id=BJuysoFeg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BJuysoFeg", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;3", "rating_avg": 5.0, "confidence_avg": 3.6666666666666665, "replies_avg": 14, "authors#_avg": 5, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 883, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11679251260326951806&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "id": "BJwFrvOeg", "title": "A Neural Knowledge Language Model", "track": "main", "status": "Reject", "tldr": "A neural recurrent language model which can extract knowledge from a knowledge base to generate knowledge related words such as person names, locations, years, etc.", "abstract": "Current language models have significant limitations in their ability to encode and decode knowledge. This is mainly because they acquire knowledge based on statistical co-occurrences, even if most of the knowledge words are rarely observed named entities. In this paper, we propose a Neural Knowledge Language Model (NKLM) which combines symbolic knowledge provided by a knowledge graph with the RNN language model. At each time step, the model predicts a fact on which the observed word is to be based. Then, a word is either generated from the vocabulary or copied from the knowledge graph. We train and test the model on a new dataset, WikiFacts. In experiments, we show that the NKLM significantly improves the perplexity while generating a much smaller number of unknown words. 
In addition, we demonstrate that the sampled descriptions include named entities which were used to be the unknown words in RNN language models.\n", "keywords": "Natural language processing;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Sungjin Ahn;Heeyoul Choi;Tanel Parnamaa;Yoshua Bengio", "authorids": "sjn.ahn@gmail.com;heeyoul@gmail.com;tanel.parnamaa@gmail.com;yoshua.bengio@umontreal.ca", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nahn2017a,\ntitle={A Neural Knowledge Language Model},\nauthor={Sungjin Ahn and Heeyoul Choi and Tanel Parnamaa and Yoshua Bengio},\nyear={2017},\nurl={https://openreview.net/forum?id=BJwFrvOeg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BJwFrvOeg", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;4;3", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 13, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 156, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6251488455740325204&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "BJxhLAuxg", "title": "A Deep Learning Approach for Joint Video Frame and Reward Prediction in Atari Games", "track": "main", "status": "Reject", "tldr": "", "abstract": "Reinforcement learning is concerned with learning to interact with environments that are initially unknown. State-of-the-art reinforcement learning approaches, such as DQN, are model-free and learn to act effectively across a wide range of environments such as Atari games, but require huge amounts of data. Model-based techniques are more data-efficient, but need to acquire explicit knowledge about the environment dynamics or the reward structure. \n\nIn this paper we take a step towards using model-based techniques in environments with high-dimensional visual state space when system dynamics and the reward structure are both unknown and need to be learned, by demonstrating that it is possible to learn both jointly.\nEmpirical evaluation on five Atari games demonstrate accurate cumulative reward prediction of up to 200 frames. 
We consider these positive results as opening up important directions for model-based RL in complex, initially unknown environments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Felix Leibfried;Nate Kushman;Katja Hofmann", "authorids": "felix.leibfried@gmail.com;nkushman@microsoft.com;katja.hofmann@microsoft.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nleibfried2017a,\ntitle={A Deep Learning Approach for Joint Video Frame and Reward Prediction in Atari Games},\nauthor={Felix Leibfried and Nate Kushman and Katja Hofmann},\nyear={2017},\nurl={https://openreview.net/forum?id=BJxhLAuxg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BJxhLAuxg", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;4;4", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9791958591967948071&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "Bk0FWVcgx", "title": "Topology and Geometry of Half-Rectified Network Optimization", "track": "main", "status": "Poster", "tldr": "We provide theoretical, algorithmical and experimental results concerning the optimization landscape of deep neural networks", "abstract": "The loss surface of deep neural networks has recently attracted interest \nin the optimization and machine learning communities as a prime example of \nhigh-dimensional non-convex problem. Some insights were recently gained using spin glass \nmodels and mean-field approximations, but at the expense of strongly simplifying the nonlinear nature of the model.\n\nIn this work, we do not make any such approximation and study conditions \non the data distribution and model architecture that prevent the existence \nof bad local minima. Our theoretical work quantifies and formalizes two \nimportant folklore facts: (i) the landscape of deep linear networks has a radically different topology \nfrom that of deep half-rectified ones, and (ii) that the energy landscape \nin the non-linear case is fundamentally controlled by the interplay between the smoothness of the data distribution and model over-parametrization. Our main theoretical contribution is to prove that half-rectified single layer networks are asymptotically connected, and we provide explicit bounds that reveal the aforementioned interplay.\n\nThe conditioning of gradient descent is the next challenge we address. \nWe study this question through the geometry of the level sets, and we introduce\nan algorithm to efficiently estimate the regularity of such sets on large-scale networks. \nOur empirical results show that these level sets remain connected throughout \nall the learning phase, suggesting a near convex behavior, but they become \nexponentially more curvy as the energy level decays, in accordance to what is observed in practice with \nvery low curvature attractors.", "keywords": "Theory;Deep learning", "primary_area": "", "supplementary_material": "", "author": "C. 
Daniel Freeman;Joan Bruna", "authorids": "daniel.freeman@berkeley.edu;bruna@cims.nyu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nfreeman2017topology,\ntitle={Topology and Geometry of Half-Rectified Network Optimization},\nauthor={C. Daniel Freeman and Joan Bruna},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=Bk0FWVcgx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=Bk0FWVcgx", "pdf_size": 0, "rating": "2;7;8", "confidence": "5;3;3", "rating_avg": 5.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 20, "authors#_avg": 2, "corr_rating_confidence": -0.9878291611472622, "gs_citation": 278, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5710196688640567330&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "Bk0MRI5lg", "title": "Bridging Nonlinearities and Stochastic Regularizers with Gaussian Error Linear Units", "track": "main", "status": "Reject", "tldr": "A Competitor of ReLUs and ELUs with a Probabilistic Underpinning", "abstract": "We propose the Gaussian Error Linear Unit (GELU), a high-performing neural network activation function. The GELU nonlinearity is the expected transformation of a stochastic regularizer which randomly applies the identity or zero map to a neuron's input. This stochastic regularizer is comparable to nonlinearities aided by dropout, but it removes the need for a traditional nonlinearity. The connection between the GELU and the stochastic regularizer suggests a new probabilistic understanding of nonlinearities. We perform an empirical evaluation of the GELU nonlinearity against the ReLU and ELU activations and find performance improvements across all tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dan Hendrycks;Kevin Gimpel", "authorids": "dan@ttic.edu;kgimpel@ttic.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nhendrycks2017bridging,\ntitle={Bridging Nonlinearities and Stochastic Regularizers with Gaussian Error Linear Units},\nauthor={Dan Hendrycks and Kevin Gimpel},\nyear={2017},\nurl={https://openreview.net/forum?id=Bk0MRI5lg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=Bk0MRI5lg", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 11, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "Bk2TqVcxe", "title": "Discovering objects and their relations from entangled scene representations", "track": "main", "status": "Workshop", "tldr": "", "abstract": "Our world can be succinctly and compactly described as structured scenes of objects and relations. A typical room, for example, contains salient objects such as tables, chairs and books, and these objects typically relate to each other by virtue of their correlated features, such as position, function and shape. 
Humans exploit knowledge of objects and their relations for learning a wide spectrum of tasks, and more generally when learning the structure underlying observed data. In this work, we introduce relation networks (RNs) - a general purpose neural network architecture for object-relation reasoning. We show that RNs are capable of learning object relations from scene description data. Furthermore, we show that RNs can act as a bottleneck that induces the factorization of objects from entangled scene description inputs, and from distributed deep representations of scene images provided by a variational autoencoder. The model can also be used in conjunction with differentiable memory mechanisms for implicit relation discovery in one-shot learning tasks. Our results suggest that relation networks are a powerful architecture for solving a variety of problems that require object relation reasoning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "David Raposo;Adam Santoro;David Barrett;Razvan Pascanu;Timothy Lillicrap;Peter Battaglia", "authorids": "draposo@google.com;adamsantoro@google.com;barrettdavid@google.com;razp@google.com;countzero@google.com;peterbattaglia@google.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nraposo2017discovering,\ntitle={Discovering objects and their relations from entangled scene representations},\nauthor={David Raposo and Adam Santoro and David Barrett and Razvan Pascanu and Timothy Lillicrap and Peter Battaglia},\nyear={2017},\nurl={https://openreview.net/forum?id=Bk2TqVcxe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Bk2TqVcxe", "pdf_size": 0, "rating": "3;7;7", "confidence": "5;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 12, "authors#_avg": 6, "corr_rating_confidence": -1.0, "gs_citation": 133, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17075890751298607392&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "Bk3F5Y9lx", "title": "Epitomic Variational Autoencoders", "track": "main", "status": "Reject", "tldr": "We introduce an extension of variational autoencoders that learns multiple shared latent subspaces to address the issue of model capacity underutilization.", "abstract": "In this paper, we propose epitomic variational autoencoder (eVAE), a probabilistic generative model of high dimensional data. eVAE is composed of a number of sparse variational autoencoders called `epitome' such that each epitome partially shares its encoder-decoder architecture with other epitomes in the composition. We show that the proposed model greatly overcomes the common problem in variational autoencoders (VAE) of model over-pruning. 
We substantiate that eVAE is efficient in using its model capacity and generalizes better than VAE, by presenting qualitative and quantitative results on MNIST and TFD datasets.", "keywords": "Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Serena Yeung;Anitha Kannan;Yann Dauphin;Li Fei-Fei", "authorids": "serena@cs.stanford.edu;akannan@fb.com;ynd@fb.com;feifeili@cs.stanford.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nyeung2017epitomic,\ntitle={Epitomic Variational Autoencoders},\nauthor={Serena Yeung and Anitha Kannan and Yann Dauphin and Li Fei-Fei},\nyear={2017},\nurl={https://openreview.net/forum?id=Bk3F5Y9lx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer5", "site": "https://openreview.net/forum?id=Bk3F5Y9lx", "pdf_size": 0, "rating": "4;5;8", "confidence": "5;5;5", "rating_avg": 5.666666666666667, "confidence_avg": 5.0, "replies_avg": 11, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=289914084536331543&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "Bk67W4Yxl", "title": "Improved Architectures for Computer Go", "track": "main", "status": "Reject", "tldr": "Improving training of deep networks for computer Go modifying the layers", "abstract": "AlphaGo trains policy networks with both supervised and reinforcement learning and makes different policy networks play millions of games so as to train a value network. The reinforcement learning part requires massive ammount of computation. We propose to train networks for computer Go so that given accuracy is reached with much less examples. We modify the architecture of the networks in order to train them faster and to have better accuracy in the end.", "keywords": "Games;Supervised Learning;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Tristan Cazenave", "authorids": "Tristan.Cazenave@dauphine.fr", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\ncazenave2017improved,\ntitle={Improved Architectures for Computer Go},\nauthor={Tristan Cazenave},\nyear={2017},\nurl={https://openreview.net/forum?id=Bk67W4Yxl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer6;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Bk67W4Yxl", "pdf_size": 0, "rating": "3;3;4;7", "confidence": "4;4;4;5", "rating_avg": 4.25, "confidence_avg": 4.25, "replies_avg": 9, "authors#_avg": 1, "corr_rating_confidence": 0.9684959969581861, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "Bk8BvDqex", "title": "Metacontrol for Adaptive Imagination-Based Optimization", "track": "main", "status": "Poster", "tldr": "We present a \"metacontroller\" neural architecture which can adaptively decide how long to run an model-based online optimization procedure for, and which models to use during the optimization.", "abstract": "Many machine learning systems are built to solve the hardest examples of a particular task, which often makes them large and expensive to run---especially with respect to the easier examples, which might require much less computation. 
For an agent with a limited computational budget, this \"one-size-fits-all\" approach may result in the agent wasting valuable computation on easy examples, while not spending enough on hard examples. Rather than learning a single, fixed policy for solving all instances of a task, we introduce a metacontroller which learns to optimize a sequence of \"imagined\" internal simulations over predictive models of the world in order to construct a more informed, and more economical, solution. The metacontroller component is a model-free reinforcement learning agent, which decides both how many iterations of the optimization procedure to run, as well as which model to consult on each iteration. The models (which we call \"experts\") can be state transition models, action-value functions, or any other mechanism that provides information useful for solving the task, and can be learned on-policy or off-policy in parallel with the metacontroller. When the metacontroller, controller, and experts were trained with \"interaction networks\" (Battaglia et al., 2016) as expert models, our approach was able to solve a challenging decision-making problem under complex non-linear dynamics. The metacontroller learned to adapt the amount of computation it performed to the difficulty of the task, and learned how to choose which experts to consult by factoring in both their reliability and individual computational resource costs. This allowed the metacontroller to achieve a lower overall cost (task loss plus computational cost) than more traditional fixed policy approaches. These results demonstrate that our approach is a powerful framework for using rich forward models for efficient model-based reinforcement learning.", "keywords": "Deep learning;Reinforcement Learning;Optimization", "primary_area": "", "supplementary_material": "", "author": "Jessica B. Hamrick;Andrew J. Ballard;Razvan Pascanu;Oriol Vinyals;Nicolas Heess;Peter W. Battaglia", "authorids": "jhamrick@berkeley.edu;aybd@google.com;razp@google.com;vinyals@google.com;heess@google.com;peterbattaglia@google.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nhamrick2017metacontrol,\ntitle={Metacontrol for Adaptive Imagination-Based Optimization},\nauthor={Jessica B. Hamrick and Andrew J. Ballard and Razvan Pascanu and Oriol Vinyals and Nicolas Heess and Peter W. 
Battaglia},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=Bk8BvDqex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer5;AnonReviewer1", "site": "https://openreview.net/forum?id=Bk8BvDqex", "pdf_size": 0, "rating": "7;8;8;8", "confidence": "3;3;3;3", "rating_avg": 7.75, "confidence_avg": 3.0, "replies_avg": 9, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 76, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16728474512617398730&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5 }, { "id": "Bk8N0RLxx", "title": "Vocabulary Selection Strategies for Neural Machine Translation", "track": "main", "status": "Reject", "tldr": "Neural machine translation can reach same accuracy with a 10x speedup by pruning the vocabulary prior to decoding.", "abstract": "Classical translation models constrain the space of possible outputs by selecting a subset of translation rules based on the input sentence. Recent work on improving the efficiency of neural translation models adopted a similar strategy by restricting the output vocabulary to a subset of likely candidates given the source. In this paper we experiment with context and embedding-based selection methods and extend previous work by examining speed and accuracy trade-offs in more detail. We show that decoding time on CPUs can be reduced by up to 90% and training time by 25% on the WMT15 English-German and WMT16 English-Romanian tasks at the same or only negligible change in accuracy. This brings the time to decode with a state of the art neural translation system to just over 140 words per seconds on a single CPU core for English-German.", "keywords": "Natural language processing", "primary_area": "", "supplementary_material": "", "author": "Gurvan L'Hostis;David Grangier;Michael Auli", "authorids": "gurvan.lhostis@polytechnique.edu;grangier@fb.com;michaelauli@fb.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nl'hostis2017vocabulary,\ntitle={Vocabulary Selection Strategies for Neural Machine Translation},\nauthor={Gurvan L'Hostis and David Grangier and Michael Auli},\nyear={2017},\nurl={https://openreview.net/forum?id=Bk8N0RLxx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer5;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=Bk8N0RLxx", "pdf_size": 0, "rating": "4;4;5;5", "confidence": "4;5;3;3", "rating_avg": 4.5, "confidence_avg": 3.75, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": -0.9045340337332909, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16564655210040450034&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "Bk8aOm9xl", "title": "Surprise-Based Intrinsic Motivation for Deep Reinforcement Learning", "track": "main", "status": "Workshop", "tldr": "Learn a dynamics model and use it to make your agent boldly go where it has not gone before.", "abstract": "Exploration in complex domains is a key challenge in reinforcement learning, especially for tasks with very sparse rewards. 
Recent successes in deep reinforcement learning have been achieved mostly using simple heuristic exploration strategies such as $\\epsilon$-greedy action selection or Gaussian control noise, but there are many tasks where these methods are insufficient to make any learning progress. Here, we consider more complex heuristics: efficient and scalable exploration strategies that maximize a notion of an agent's surprise about its experiences via intrinsic motivation. We propose to learn a model of the MDP transition probabilities concurrently with the policy, and to form intrinsic rewards that approximate the KL-divergence of the true transition probabilities from the learned model. One of our approximations results in using surprisal as intrinsic motivation, while the other gives the $k$-step learning progress. We show that our incentives enable agents to succeed in a wide range of environments with high-dimensional state spaces and very sparse rewards, including continuous control tasks and games in the Atari RAM domain, outperforming several other heuristic exploration techniques. \n", "keywords": "Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Joshua Achiam;Shankar Sastry", "authorids": "jachiam@berkeley.edu;sastry@coe.berkeley.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nachiam2017surprisebased,\ntitle={Surprise-Based Intrinsic Motivation for Deep Reinforcement Learning},\nauthor={Joshua Achiam and Shankar Sastry},\nyear={2017},\nurl={https://openreview.net/forum?id=Bk8aOm9xl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Bk8aOm9xl", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;3;3", "rating_avg": 6.0, "confidence_avg": 3.3333333333333335, "replies_avg": 10, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 301, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10356749480761047266&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "BkCPyXm1l", "title": "SoftTarget Regularization: An Effective Technique to Reduce Over-Fitting in Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep neural networks are learning models with a very high capacity and therefore prone to over-fitting. Many regularization techniques such as Dropout, DropConnect, and weight decay all attempt to solve the problem of over-fitting by reducing the capacity of their respective models (Srivastava et al., 2014), (Wan et al., 2013), (Krogh & Hertz, 1992). In this paper we introduce a new form of regularization that guides the learning problem in a way that reduces over-fitting without sacrificing the capacity of the model. The mistakes that models make in early stages of training carry information about the learning problem. By adjusting the labels of the current epoch of training through a weighted average of the real labels, and an exponential average of the past soft-targets we achieved a regularization scheme as powerful as Dropout without necessarily reducing the capacity of the model, and simplified the complexity of the learning problem. 
SoftTarget regularization proved to be an effective tool in various neural network architectures.", "keywords": "Deep learning;Optimization;Computer vision", "primary_area": "", "supplementary_material": "", "author": "Armen Aghajanyan", "authorids": "armen.ag@live.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\naghajanyan2017softtarget,\ntitle={SoftTarget Regularization: An Effective Technique to Reduce Over-Fitting in Neural Networks},\nauthor={Armen Aghajanyan},\nyear={2017},\nurl={https://openreview.net/forum?id=BkCPyXm1l}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=BkCPyXm1l", "pdf_size": 0, "rating": "3;4;4", "confidence": "5;5;5", "rating_avg": 3.6666666666666665, "confidence_avg": 5.0, "replies_avg": 20, "authors#_avg": 1, "corr_rating_confidence": 0.0, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5192729270698671724&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "BkGakb9lx", "title": "RenderGAN: Generating Realistic Labeled Data", "track": "main", "status": "Workshop", "tldr": "We embed a 3D model in the GAN framework to generate realistic, labeled data.", "abstract": "Deep Convolutional Neuronal Networks (DCNNs) are showing remarkable performance on many computer vision tasks. Due to their large parameter space, they require many labeled samples when trained in a supervised setting. The costs of annotating data manually can render the use of DCNNs infeasible. We present a novel framework called RenderGAN that can generate large amounts of realistic, labeled images by combining a 3D model and the Generative Adversarial Network framework. In our approach, image augmentations (e.g. lighting, background, and detail) are learned from unlabeled data such that the generated images are strikingly realistic while preserving the labels known from the 3D model. We apply the RenderGAN framework to generate images of barcode-like markers that are attached to honeybees. Training a DCNN on data generated by the RenderGAN yields considerably better performance than training it on various baselines. 
", "keywords": "Unsupervised Learning;Computer vision;Deep learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Leon Sixt;Benjamin Wild;Tim Landgraf", "authorids": "leon.sixt@fu-berlin.de;benjamin.wild@fu-berlin.de;tim.landgraf@fu-berlin.de", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nlee2017making,\ntitle={Making Stochastic Neural Networks from Deterministic Ones},\nauthor={Kimin Lee and Jaehyung Kim and Song Chong and Jinwoo Shin},\nyear={2017},\nurl={https://openreview.net/forum?id=B1akgy9xx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BkGakb9lx", "pdf_size": 0, "rating": "5;6;6", "confidence": "3;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 16, "authors#_avg": 3, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 216, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15927402985021811547&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 17 }, { "id": "BkIqod5ll", "title": "Convolutional Neural Networks Generalization Utilizing the Data Graph Structure", "track": "main", "status": "Reject", "tldr": "A generalization of CNNs to standard regression and classification problems by using random walk on the data graph structure.", "abstract": "Convolutional Neural Networks have proved to be very efficient in image and audio processing. Their success is mostly attributed to the convolutions which utilize the geometric properties of a low - dimensional grid structure. This paper suggests a generalization of CNNs to graph-structured data with varying graph structure, that can be applied to standard regression or classification problems by learning the graph structure of the data. We propose a novel convolution framework approach on graphs which utilizes a random walk to select relevant nodes. The convolution shares weights on all features, providing the desired parameter efficiency. Furthermore, the additional computations in the training process are only executed once in the pre-processing step. 
We empirically demonstrate the performance of the proposed CNN on MNIST data set, and challenge the state-of-the-art on Merck molecular activity data set.", "keywords": "Supervised Learning;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Yotam Hechtlinger;Purvasha Chakravarti;Jining Qin", "authorids": "yhechtli@andrew.cmu.edu;pchakrav@andrew.cmu.edu;jiningq@andrew.cmu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nhechtlinger2017convolutional,\ntitle={Convolutional Neural Networks Generalization Utilizing the Data Graph Structure},\nauthor={Yotam Hechtlinger and Purvasha Chakravarti and Jining Qin},\nyear={2017},\nurl={https://openreview.net/forum?id=BkIqod5ll}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=BkIqod5ll", "pdf_size": 0, "rating": "3;3;6", "confidence": "0;3;3", "rating_avg": 4.0, "confidence_avg": 2.0, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": 0.5000000000000001, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5623798728517450009&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "BkJsCIcgl", "title": "The Predictron: End-To-End Learning and Planning", "track": "main", "status": "Reject", "tldr": "", "abstract": "One of the key challenges of artificial intelligence is to learn models that are effective in the context of planning. In this document we introduce the predictron architecture. The predictron consists of a fully abstract model, represented by a Markov reward process, that can be rolled forward multiple \"imagined\" planning steps. Each forward pass of the predictron accumulates internal rewards and values over multiple planning depths. \nThe predictron is trained end-to-end so as to make these accumulated values accurately approximate the true value function, thereby focusing the model upon the aspects of the environment most relevant to planning. We applied the predictron to procedurally generated random mazes and a simulator for the game of pool. 
The predictron yielded significantly more accurate predictions than conventional deep neural network architectures.", "keywords": "Deep learning;Reinforcement Learning;Supervised Learning;Semi-Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "David Silver;Hado van Hasselt;Matteo Hessel;Tom Schaul;Arthur Guez;Tim Harley;Gabriel Dulac-Arnold;David Reichert;Neil Rabinowitz;Andre Barreto;Thomas Degris", "authorids": "davidsilver@google.com;hado@google.com;mtthss@google.com;schaul@google.com;aguez@google.com;tharley@google.com;dulacarnold@google.com;reichert@google.com;ncr@google.com;andrebarreto@google.com;degris@google.com", "gender": ";;;;;;;;;;", "homepage": ";;;;;;;;;;", "dblp": ";;;;;;;;;;", "google_scholar": ";;;;;;;;;;", "orcid": ";;;;;;;;;;", "linkedin": ";;;;;;;;;;", "or_profile": ";;;;;;;;;;", "aff": ";;;;;;;;;;", "aff_domain": ";;;;;;;;;;", "position": ";;;;;;;;;;", "bibtex": "@misc{\nsilver2017the,\ntitle={The Predictron: End-To-End Learning and Planning},\nauthor={David Silver and Hado van Hasselt and Matteo Hessel and Tom Schaul and Arthur Guez and Tim Harley and Gabriel Dulac-Arnold and David Reichert and Neil Rabinowitz and Andre Barreto and Thomas Degris},\nyear={2017},\nurl={https://openreview.net/forum?id=BkJsCIcgl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer5;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=BkJsCIcgl", "pdf_size": 0, "rating": "4;6;9", "confidence": "4;5;2", "rating_avg": 6.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 21, "authors#_avg": 11, "corr_rating_confidence": -0.7370434740955021, "gs_citation": 326, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=123025585147889247&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "BkLhzHtlg", "title": "Learning Recurrent Representations for Hierarchical Behavior Modeling", "track": "main", "status": "Poster", "tldr": "", "abstract": "We propose a framework for detecting action patterns from motion sequences and modeling the sensory-motor relationship of animals, using a generative recurrent neural network. The network has a discriminative part (classifying actions) and a generative part (predicting motion), whose recurrent cells are laterally connected, allowing higher levels of the network to represent high level behavioral phenomena. We test our framework on two types of tracking data, fruit fly behavior and online handwriting. Our results show that 1) taking advantage of unlabeled sequences, by predicting future motion, significantly improves action detection performance when training labels are scarce, 2) the network learns to represent high level phenomena such as writer identity and fly gender, without supervision, and 3) simulated motion trajectories, generated by treating motion prediction as input to the network, look realistic and may be used to qualitatively evaluate whether the model has learnt generative control rules. 
", "keywords": "Unsupervised Learning;Semi-Supervised Learning;Reinforcement Learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Eyrun Eyjolfsdottir;Kristin Branson;Yisong Yue;Pietro Perona", "authorids": "eeyjolfs@caltech.edu;bransonk@janelia.hhmi.org;yyue@caltech.edu;perona@caltech.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\neyjolfsdottir2017learning,\ntitle={Learning Recurrent Representations for Hierarchical Behavior Modeling},\nauthor={Eyrun Eyjolfsdottir and Kristin Branson and Yisong Yue and Pietro Perona},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=BkLhzHtlg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BkLhzHtlg", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;4;3", "rating_avg": 6.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 21, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14003104177710030101&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9 }, { "id": "BkSmc8qll", "title": "Dynamic Neural Turing Machine with Continuous and Discrete Addressing Schemes", "track": "main", "status": "Reject", "tldr": "We propose a new type of Neural Turing Machine, which is simpler than the original model and achieves better results than the baselines on non-trivial tasks. ", "abstract": "In this paper, we extend neural Turing machine (NTM) into a dynamic neural Turing machine (D-NTM) by introducing a trainable memory addressing scheme. This addressing scheme maintains for each memory cell two separate vectors, content and address vectors. This allows the D-NTM to learn a wide variety of location-based addressing strategies including both linear and nonlinear ones. We implement the D-NTM with both continuous, differentiable and discrete, non-differentiable read/write mechanisms. We investigate the mechanisms and effects for learning to read and write to a memory through experiments on Facebook bAbI tasks using both a feedforward and GRU-controller. The D-NTM is evaluated on a set of Facebook bAbI tasks and shown to outperform NTM and LSTM baselines. 
We also provide further experimental results on sequential MNIST, associative recall and copy tasks.", "keywords": "Deep learning;Natural language processing;Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Caglar Gulcehre;Sarath Chandar;Kyunghyun Cho;Yoshua Bengio", "authorids": "gulcehrc@iro.umontreal.ca;apsarathchandar@gmail.com;kyunghyun.cho@nyu.edu;yoshua.umontreal@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ngulcehre2017dynamic,\ntitle={Dynamic Neural Turing Machine with Continuous and Discrete Addressing Schemes},\nauthor={Caglar Gulcehre and Sarath Chandar and Kyunghyun Cho and Yoshua Bengio},\nyear={2017},\nurl={https://openreview.net/forum?id=BkSmc8qll}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=BkSmc8qll", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 18, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 70, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=339341030139268843&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11 }, { "id": "BkSqjHqxg", "title": "Skip-graph: Learning graph embeddings with an encoder-decoder model", "track": "main", "status": "Reject", "tldr": "An unsupervised method for generating graph feature representations based on the encoder-decoder model.", "abstract": "In this work, we study the problem of feature representation learning for graph-structured data. Many of the existing work in the area are task-specific and based on supervised techniques. We study a method for obtaining a generic feature representation for a graph using an unsupervised approach. The neural encoder-decoder model is a method that has been used in the natural language processing domain to learn feature representations of sentences. In our proposed approach, we train the encoder-decoder model to predict the random walk sequence of neighboring regions in a graph given a random walk along a particular region. The goal is to map subgraphs \u2014 as represented by their random walks \u2014 that are structurally and functionally similar to nearby locations in feature space. We evaluate the learned graph vectors using several real-world datasets on the graph classification task. 
The proposed model is able to achieve good results against state-of-the-art techniques.", "keywords": "Unsupervised Learning;Deep learning", "primary_area": "", "supplementary_material": "", "author": "John Boaz Lee;Xiangnan Kong", "authorids": "jtlee@wpi.edu;xkong@wpi.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nlee2017skipgraph,\ntitle={Skip-graph: Learning graph embeddings with an encoder-decoder model},\nauthor={John Boaz Lee and Xiangnan Kong},\nyear={2017},\nurl={https://openreview.net/forum?id=BkSqjHqxg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BkSqjHqxg", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;1;3", "rating_avg": 6.0, "confidence_avg": 2.6666666666666665, "replies_avg": 18, "authors#_avg": 2, "corr_rating_confidence": -0.32732683535398854, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14901575348425187410&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "BkUDvt5gg", "title": "Wav2Letter: an End-to-End ConvNet-based Speech Recognition System", "track": "main", "status": "Reject", "tldr": "We propose convnet models and new sequence criterions for training end-to-end letter-based speech systems.", "abstract": "This paper presents a simple end-to-end model for speech recognition, combining a convolutional network based acoustic model and a graph decoding. It is trained to output letters, with transcribed speech, without the need for force alignment of phonemes. We introduce an automatic segmentation criterion for training from sequence annotation without alignment that is on par with CTC (Graves et al., 2006) while being simpler. 
We show competitive results in word error rate on the Librispeech corpus (Panayotov et al., 2015) with MFCC features, and promising results from raw waveform.", "keywords": "Deep learning;Speech;Structured prediction", "primary_area": "", "supplementary_material": "", "author": "Ronan Collobert;Christian Puhrsch;Gabriel Synnaeve", "authorids": "locronan@fb.com;cpuhrsch@fb.com;gab@fb.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ncollobert2017wavletter,\ntitle={Wav2Letter: an End-to-End ConvNet-based Speech Recognition System},\nauthor={Ronan Collobert and Christian Puhrsch and Gabriel Synnaeve},\nyear={2017},\nurl={https://openreview.net/forum?id=BkUDvt5gg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BkUDvt5gg", "pdf_size": 0, "rating": "6;7", "confidence": "4;5", "rating_avg": 6.5, "confidence_avg": 4.5, "replies_avg": 18, "authors#_avg": 3, "corr_rating_confidence": 0.9999999999999999, "gs_citation": 379, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7029588050335776081&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "BkV4VS9ll", "title": "The Incredible Shrinking Neural Network: New Perspectives on Learning Representations Through The Lens of Pruning", "track": "main", "status": "Reject", "tldr": "Pruning algorithms reveal fundamental insights into neural network learning representations", "abstract": "How much can pruning algorithms teach us about the fundamentals of learning representations in neural networks? A lot, it turns out. Neural network model compression has become a topic of great interest in recent years, and many different techniques have been proposed to address this problem. In general, this is motivated by the idea that smaller models typically lead to better generalization. At the same time, the decision of what to prune and when to prune necessarily forces us to confront our assumptions about how neural networks actually learn to represent patterns in data. In this work we set out to test several long-held hypotheses about neural network learning representations and numerical approaches to pruning. To accomplish this we first reviewed the historical literature and derived a novel algorithm to prune whole neurons (as opposed to the traditional method of pruning weights) from optimally trained networks using a second-order Taylor method. We then set about testing the performance of our algorithm and analyzing the quality of the decisions it made. As a baseline for comparison we used a first-order Taylor method based on the Skeletonization algorithm and an exhaustive brute-force serial pruning algorithm. Our proposed algorithm worked well compared to a first-order method, but not nearly as well as the brute-force method. Our error analysis led us to question the validity of many widely-held assumptions behind pruning algorithms in general and the trade-offs we often make in the interest of reducing computational complexity. We discovered that there is a straightforward way, however expensive, to serially prune 40-70\\% of the neurons in a trained network with minimal effect on the learning representation and without any re-training. 
", "keywords": "Theory;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Nikolas Wolfe;Aditya Sharma;Lukas Drude;Bhiksha Raj", "authorids": "nwolfe@cs.cmu.edu;adityasharma@cmu.edu;drude@nt.upb.de;bhiksha@cs.cmu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nwolfe2017the,\ntitle={The Incredible Shrinking Neural Network: New Perspectives on Learning Representations Through The Lens of Pruning},\nauthor={Nikolas Wolfe and Aditya Sharma and Lukas Drude and Bhiksha Raj},\nyear={2017},\nurl={https://openreview.net/forum?id=BkV4VS9ll}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=BkV4VS9ll", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;4;4", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 16, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9976695568017833676&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "BkVsEMYel", "title": "Inductive Bias of Deep Convolutional Networks through Pooling Geometry", "track": "main", "status": "Poster", "tldr": "We study the ability of convolutional networks to model correlations among regions of their input, showing that this is controlled by shapes of pooling windows.", "abstract": "Our formal understanding of the inductive bias that drives the success of convolutional networks on computer vision tasks is limited. In particular, it is unclear what makes hypotheses spaces born from convolution and pooling operations so suitable for natural images. In this paper we study the ability of convolutional networks to model correlations among regions of their input. We theoretically analyze convolutional arithmetic circuits, and empirically validate our findings on other types of convolutional networks as well. Correlations are formalized through the notion of separation rank, which for a given partition of the input, measures how far a function is from being separable. We show that a polynomially sized deep network supports exponentially high separation ranks for certain input partitions, while being limited to polynomial separation ranks for others. The network's pooling geometry effectively determines which input partitions are favored, thus serves as a means for controlling the inductive bias. Contiguous pooling windows as commonly employed in practice favor interleaved partitions over coarse ones, orienting the inductive bias towards the statistics of natural images. Other pooling schemes lead to different preferences, and this allows tailoring the network to data that departs from the usual domain of natural imagery. 
In addition to analyzing deep networks, we show that shallow ones support only linear separation ranks, and by this gain insight into the benefit of functions brought forth by depth - they are able to efficiently model strong correlation under favored partitions of the input.", "keywords": "Theory;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Nadav Cohen;Amnon Shashua", "authorids": "cohennadav@cs.huji.ac.il;shashua@cs.huji.ac.il", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\ncohen2017inductive,\ntitle={Inductive Bias of Deep Convolutional Networks through Pooling Geometry},\nauthor={Nadav Cohen and Amnon Shashua},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=BkVsEMYel}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=BkVsEMYel", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;5;3", "rating_avg": 6.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 10, "authors#_avg": 2, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 147, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7295097861408229778&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "BkXMikqxx", "title": "Cortical-Inspired Open-Bigram Representation for Handwritten Word Recognition", "track": "main", "status": "Reject", "tldr": "We propose an handwritten word recognition method based on an open-bigram representation of words, inspired from the research in cognitive psychology", "abstract": "Recent research in the cognitive process of reading hypothesized that we do\nnot read words by sequentially recognizing letters, but rather by identifing\nopen-bigrams, i.e. couple of letters that are not necessarily next\nto each other. \nIn this paper, we evaluate an handwritten word recognition method based on original\nopen-bigrams representation. We trained Long Short-Term Memory Recurrent Neural Networks\n(LSTM-RNNs) to predict open-bigrams rather than characters, and we show that\nsuch models are able to learn the long-range, complicated and intertwined dependencies\nin the input signal, necessary to the prediction. \nFor decoding, we decomposed each word of a large vocabulary into the set of\nconstituent bigrams, and apply a simple cosine similarity measure between this \nrepresentation and the bagged RNN prediction to retrieve the vocabulary word. 
\nWe compare this method to standard word recognition techniques based on \nsequential character recognition.\nExperiments are carried out on two public databases of handwritten words\n(Rimes and IAM), an the results with our bigram decoder are comparable \nto more conventional decoding methods based on sequences of letters.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Th\u00e9odore Bluche;Christopher Kermorvant;Claude Touzet;Herv\u00e9 Glotin", "authorids": "tb@a2ia.com;kermorvant@teklia.com;claude.touzet@univ-amu.fr;glotin@univ-tln.fr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nbluche2017corticalinspired,\ntitle={Cortical-Inspired Open-Bigram Representation for Handwritten Word Recognition},\nauthor={Th{\\'e}odore Bluche and Christopher Kermorvant and Claude Touzet and Herv{\\'e} Glotin},\nyear={2017},\nurl={https://openreview.net/forum?id=BkXMikqxx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BkXMikqxx", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;5;5", "rating_avg": 5.333333333333333, "confidence_avg": 4.666666666666667, "replies_avg": 10, "authors#_avg": 4, "corr_rating_confidence": 0.7559289460184544, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18319999086989303223&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "Bk_zTU5eg", "title": "Inefficiency of stochastic gradient descent with larger mini-batches (and more learners)", "track": "main", "status": "Reject", "tldr": "We theoretically justify that increasing mini-batch size or increasing the number of learners can lead to slower SGD/ASGD convergence", "abstract": "Stochastic Gradient Descent (SGD) and its variants are the most important optimization algorithms used in large scale machine learning. Mini-batch version of stochastic gradient is often used in practice for taking advantage of hardware parallelism. In this work, we analyze the effect of mini-batch size over SGD convergence for the case of general non-convex objective functions. Building on the past analyses, we justify mathematically that there can often be a large difference between the convergence guarantees provided by small and large mini-batches (given each instance processes equal number of training samples), while providing experimental evidence for the same. Going further to distributed settings, we show that an analogous effect holds with popular Asynchronous Gradient Descent (\\asgd): there can be a large difference between convergence guarantees with increasing number of learners given that the cumulative number of training samples processed remains the same. 
Thus there is an inherent (and similar) inefficiency introduced in the convergence behavior when we attempt to take advantage of parallelism, either by increasing mini-batch size or by increase the number of learners.", "keywords": "Deep learning;Optimization", "primary_area": "", "supplementary_material": "", "author": "Onkar Bhardwaj;Guojing Cong", "authorids": "onkar.bhardwaj@gmail.com;gcong@us.ibm.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nbhardwaj2017inefficiency,\ntitle={Inefficiency of stochastic gradient descent with larger mini-batches (and more learners)},\nauthor={Onkar Bhardwaj and Guojing Cong},\nyear={2017},\nurl={https://openreview.net/forum?id=Bk_zTU5eg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Bk_zTU5eg", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;3;4", "rating_avg": 5.0, "confidence_avg": 3.6666666666666665, "replies_avg": 14, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9418880177714650713&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "Bkab5dqxe", "title": "A Compositional Object-Based Approach to Learning Physical Dynamics", "track": "main", "status": "Poster", "tldr": "We propose a factorization of a physical scene into composable object-based representations and also a model architecture whose compositional structure factorizes object dynamics into pairwise interactions.", "abstract": "We present the Neural Physics Engine (NPE), a framework for learning simulators of intuitive physics that naturally generalize across variable object count and different scene configurations. We propose a factorization of a physical scene into composable object-based representations and a neural network architecture whose compositional structure factorizes object dynamics into pairwise interactions. Like a symbolic physics engine, the NPE is endowed with generic notions of objects and their interactions; realized as a neural network, it can be trained via stochastic gradient descent to adapt to specific object properties and dynamics of different worlds. We evaluate the efficacy of our approach on simple rigid body dynamics in two-dimensional worlds. 
By comparing to less structured architectures, we show that the NPE's compositional representation of the structure in physical interactions improves its ability to predict movement, generalize across variable object count and different scene configurations, and infer latent properties of objects such as mass.", "keywords": "Deep learning;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Michael Chang;Tomer Ullman;Antonio Torralba;Joshua Tenenbaum", "authorids": "mbchang@mit.edu;tomeru@mit.edu;torralba@mit.edu;jbt@mit.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nchang2017a,\ntitle={A Compositional Object-Based Approach to Learning Physical Dynamics},\nauthor={Michael Chang and Tomer Ullman and Antonio Torralba and Joshua Tenenbaum},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=Bkab5dqxe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=Bkab5dqxe", "pdf_size": 0, "rating": "6;7;9", "confidence": "4;4;4", "rating_avg": 7.333333333333333, "confidence_avg": 4.0, "replies_avg": 19, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 530, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9706972547667418204&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11 }, { "id": "BkbY4psgg", "title": "Making Neural Programming Architectures Generalize via Recursion", "track": "main", "status": "Oral", "tldr": "", "abstract": "Empirically, neural networks that attempt to learn programs from data have exhibited poor generalizability. Moreover, it has traditionally been difficult to reason about the behavior of these models beyond a certain level of input complexity. In order to address these issues, we propose augmenting neural architectures with a key abstraction: recursion. As an application, we implement recursion in the Neural Programmer-Interpreter framework on four tasks: grade-school addition, bubble sort, topological sort, and quicksort. We demonstrate superior generalizability and interpretability with small amounts of training data. Recursion divides the problem into smaller pieces and drastically reduces the domain of each neural network component, making it tractable to prove guarantees about the overall system\u2019s behavior. 
Our experience suggests that in order for neural architectures to robustly learn program semantics, it is necessary to incorporate a concept like recursion.", "keywords": "Deep learning", "primary_area": "", "supplementary_material": "", "author": "Jonathon Cai;Richard Shin;Dawn Song", "authorids": "jonathon@cs.berkeley.edu;ricshin@cs.berkeley.edu;dawnsong@cs.berkeley.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ncai2017making,\ntitle={Making Neural Programming Architectures Generalize via Recursion},\nauthor={Jonathon Cai and Richard Shin and Dawn Song},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=BkbY4psgg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BkbY4psgg", "pdf_size": 0, "rating": "8;8;9", "confidence": "3;4;5", "rating_avg": 8.333333333333334, "confidence_avg": 4.0, "replies_avg": 14, "authors#_avg": 3, "corr_rating_confidence": 0.8660254037844387, "gs_citation": 154, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1023802487383538904&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 14 }, { "id": "Bkbc-Vqeg", "title": "Learning Word-Like Units from Joint Audio-Visual Analylsis", "track": "main", "status": "Reject", "tldr": "", "abstract": "Given a collection of images and spoken audio captions, we present a method for discovering word-like acoustic units in the continuous speech signal and grounding them to semantically relevant image regions. For example, our model is able to detect spoken instances of the words ``lighthouse'' within an utterance and associate them with image regions containing lighthouses. We do not use any form of conventional automatic speech recognition, nor do we use any text transcriptions or conventional linguistic annotations. Our model effectively implements a form of spoken language acquisition, in which the computer learns not only to recognize word categories by sound, but also to enrich the words it learns with semantics by grounding them in images.", "keywords": "Speech;Computer vision;Deep learning;Multi-modal learning;Unsupervised Learning;Semi-Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "David Harwath;James R. Glass", "authorids": "dharwath@mit.edu;glass@mit.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nharwath2017learning,\ntitle={Learning Word-Like Units from Joint Audio-Visual Analylsis},\nauthor={David Harwath and James R. 
Glass},\nyear={2017},\nurl={https://openreview.net/forum?id=Bkbc-Vqeg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Bkbc-Vqeg", "pdf_size": 0, "rating": "5;5;6", "confidence": "5;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 119, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=527225551708857174&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11 }, { "id": "BkdpaH9ll", "title": "Boosting Image Captioning with Attributes", "track": "main", "status": "Reject", "tldr": "Boosting Image Captioning with Attributes", "abstract": "Automatically describing an image with a natural language has been an emerging challenge in both fields of computer vision and natural language processing. In this paper, we present Long Short-Term Memory with Attributes (LSTM-A) - a novel architecture that integrates attributes into the successful Convolutional Neural Networks (CNNs) plus Recurrent Neural Networks (RNNs) image captioning framework, by training them in an end-to-end manner. To incorporate attributes, we construct variants of architectures by feeding image representations and attributes into RNNs in different ways to explore the mutual but also fuzzy relationship between them. Extensive experiments are conducted on COCO image captioning dataset and our framework achieves superior results when compared to state-of-the-art deep models. Most remarkably, we obtain METEOR/CIDEr-D of 25.2%/98.6% on testing data of widely used and publicly available splits in \\citep{Karpathy:CVPR15} when extracting image representations by GoogleNet and achieve to date top-1 performance on COCO captioning Leaderboard.", "keywords": "Computer vision;Applications", "primary_area": "", "supplementary_material": "", "author": "Ting Yao;Yingwei Pan;Yehao Li;Zhaofan Qiu;Tao Mei", "authorids": "tiyao@microsoft.com;v-yipan@microsoft.com;v-yehl@microsoft.com;v-zhqiu@microsoft.com;tmei@microsoft.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nyao2017boosting,\ntitle={Boosting Image Captioning with Attributes},\nauthor={Ting Yao and Yingwei Pan and Yehao Li and Zhaofan Qiu and Tao Mei},\nyear={2017},\nurl={https://openreview.net/forum?id=BkdpaH9ll}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BkdpaH9ll", "pdf_size": 0, "rating": "4;5;6", "confidence": "5;4;5", "rating_avg": 5.0, "confidence_avg": 4.666666666666667, "replies_avg": 13, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 865, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18010599519948968721&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "Bkepl7cee", "title": "Parametric Exponential Linear Unit for Deep Convolutional Neural Networks", "track": "main", "status": "Reject", "tldr": "Learning a parameterization of the ELU activation function improves its performance.", "abstract": "The activation function is an important component in Convolutional Neural Networks (CNNs). For instance, recent breakthroughs in Deep Learning can be attributed to the Rectified Linear Unit (ReLU). 
Another recently proposed activation function, the Exponential Linear Unit (ELU), has the supplementary property of reducing bias shift without explicitly centering the values at zero. In this paper, we show that learning a parameterization of ELU improves its performance. We analyzed our proposed Parametric ELU (PELU) in the context of vanishing gradients and provide a gradient-based optimization framework. We conducted several experiments on CIFAR-10/100 and ImageNet with different network architectures, such as NiN, Overfeat, All-CNN and ResNet. Our results show that our PELU has relative error improvements over ELU of 4.45% and 5.68% on CIFAR-10 and 100, and as much as 7.28% with only 0.0003% parameter increase on ImageNet. We also observed that Vgg using PELU tended to prefer activations saturating closer to zero, as in ReLU, except at the last layer, which saturated near -2. Finally, other presented results suggest that varying the shape of the activations during training along with the other parameters helps controlling vanishing gradients and bias shift, thus facilitating learning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ludovic Trottier;Philippe Gigu\u00e8re;Brahim Chaib-draa", "authorids": "ludovic.trottier.1@ulaval.ca;philippe.giguere@ift.ulaval.ca;brahim.chaib-draa@ift.ulaval.ca", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ntrottier2017parametric,\ntitle={Parametric Exponential Linear Unit for Deep Convolutional Neural Networks},\nauthor={Ludovic Trottier and Philippe Gigu{\\`e}re and Brahim Chaib-draa},\nyear={2017},\nurl={https://openreview.net/forum?id=Bkepl7cee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=Bkepl7cee", "pdf_size": 0, "rating": "4;5;6;7", "confidence": "4;4;4;5", "rating_avg": 5.5, "confidence_avg": 4.25, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": 0.7745966692414834, "gs_citation": 293, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15166150093841301194&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "BkfiXiUlg", "title": "Learning Efficient Algorithms with Hierarchical Attentive Memory", "track": "main", "status": "Reject", "tldr": "fast attention in O(log n); learned sorting algorithm that generalizes", "abstract": "In this paper, we propose and investigate a novel memory architecture for neural networks called Hierarchical Attentive Memory (HAM). It is based on a binary tree with leaves corresponding to memory cells. This allows HAM to perform memory access in O(log n) complexity, which is a significant improvement over the standard attention mechanism that requires O(n) operations, where n is the size of the memory. \n\nWe show that an LSTM network augmented with HAM can learn algorithms for problems like merging, sorting or binary searching from pure input-output examples. In particular, it learns to sort n numbers in time O(n log n) and generalizes well to input sequences much longer than the ones seen during the training. 
We also show that HAM can be trained to act like classic data structures: a stack, a FIFO queue and a priority queue.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Marcin Andrychowicz;Karol Kurach", "authorids": "marcin@openai.com;kkurach@google.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nandrychowicz2017learning,\ntitle={Learning Efficient Algorithms with Hierarchical Attentive Memory},\nauthor={Marcin Andrychowicz and Karol Kurach},\nyear={2017},\nurl={https://openreview.net/forum?id=BkfiXiUlg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BkfiXiUlg", "pdf_size": 0, "rating": "3;5;5", "confidence": "4;5;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8395773790338973618&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "Bkfwyw5xg", "title": "Investigating Different Context Types and Representations for Learning Word Embeddings", "track": "main", "status": "Reject", "tldr": "This paper investigate different context types and representations for learning word embeddings.", "abstract": "The number of word embedding models is growing every year. Most of them learn word embeddings based on the co-occurrence information of words and their context. However, it's still an open question what is the best definition of context. We provide the first systematical investigation of different context types and representations for learning word embeddings. We conduct comprehensive experiments to evaluate their effectiveness under 4 tasks (21 datasets), which give us some insights about context selection. 
We hope that this paper, along with the published code, can serve as a guideline of choosing context for our community.\n", "keywords": "Unsupervised Learning;Natural language processing", "primary_area": "", "supplementary_material": "", "author": "Bofang Li;Tao Liu;Zhe Zhao;Buzhou Tang;Xiaoyong Du", "authorids": "libofang@ruc.edu.cn;tliu@ruc.edu.cn;helloworld@ruc.edu.cn;tangbuzhou@gmail.com;duyong@ruc.edu.cn", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nli2017investigating,\ntitle={Investigating Different Context Types and Representations for Learning Word Embeddings},\nauthor={Bofang Li and Tao Liu and Zhe Zhao and Buzhou Tang and Xiaoyong Du},\nyear={2017},\nurl={https://openreview.net/forum?id=Bkfwyw5xg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=Bkfwyw5xg", "pdf_size": 0, "rating": "4;4;6", "confidence": "5;3;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 17, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:_blL6IjYDYMJ:scholar.google.com/&scioq=Investigating+Different+Context+Types+and+Representations+for+Learning+Word+Embeddings&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "BkjLkSqxg", "title": "LipNet: End-to-End Sentence-level Lipreading", "track": "main", "status": "Reject", "tldr": "LipNet is the first end-to-end sentence-level lipreading model to simultaneously learn spatiotemporal visual features and a sequence model.", "abstract": "Lipreading is the task of decoding text from the movement of a speaker's mouth. Traditional approaches separated the problem into two stages: designing or learning visual features, and prediction. More recent deep lipreading approaches are end-to-end trainable (Wand et al., 2016; Chung & Zisserman, 2016a). However, existing work on models trained end-to-end perform only word classification, rather than sentence-level sequence prediction. Studies have shown that human lipreading performance increases for longer words (Easton & Basala, 1982), indicating the importance of features capturing temporal context in an ambiguous communication channel. Motivated by this observation, we present LipNet, a model that maps a variable-length sequence of video frames to text, making use of spatiotemporal convolutions, a recurrent network, and the connectionist temporal classification loss, trained entirely end-to-end. To the best of our knowledge, LipNet is the first end-to-end sentence-level lipreading model that simultaneously learns spatiotemporal visual features and a sequence model. On the GRID corpus, LipNet achieves 95.2% accuracy in sentence-level, overlapped speaker split task, outperforming experienced human lipreaders and the previous 86.4% word-level state-of-the-art accuracy (Gergen et al., 2016).", "keywords": "Computer vision;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Yannis M. 
Assael;Brendan Shillingford;Shimon Whiteson;Nando de Freitas", "authorids": "yannis.assael@cs.ox.ac.uk;brendan.shillingford@cs.ox.ac.uk;shimon.whiteson@cs.ox.ac.uk;nando.de.freitas@cs.ox.ac.uk", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nassael2017lipnet,\ntitle={LipNet: End-to-End Sentence-level Lipreading},\nauthor={Yannis M. Assael and Brendan Shillingford and Shimon Whiteson and Nando de Freitas},\nyear={2017},\nurl={https://openreview.net/forum?id=BkjLkSqxg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BkjLkSqxg", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;4;3", "rating_avg": 4.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 28, "authors#_avg": 4, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 531, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8138822997856823484&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "BkmM8Dceg", "title": "Warped Convolutions: Efficient Invariance to Spatial Transformations", "track": "main", "status": "Reject", "tldr": "", "abstract": "Convolutional Neural Networks (CNNs) are extremely efficient, since they exploit the inherent translation-invariance of natural images. However, translation is just one of a myriad of useful spatial transformations. Can the same efficiency be attained when considering other spatial invariances? Such generalized convolutions have been considered in the past, but at a high computational cost. We present a construction that is simple and exact, yet has the same computational complexity that standard convolutions enjoy. It consists of a constant image warp followed by a simple convolution, which are standard blocks in deep learning toolboxes. With a carefully crafted warp, the resulting architecture can be made equivariant to a wide range of 2-parameters spatial transformations. We show encouraging results in realistic scenarios, including the estimation of vehicle poses in the Google Earth dataset (rotation and scale), and face poses in Annotated Facial Landmarks in the Wild (3D rotations under perspective).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Joao F. Henriques;Andrea Vedaldi", "authorids": "joao@robots.ox.ac.uk;vedaldi@robots.ox.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nhenriques2017warped,\ntitle={Warped Convolutions: Efficient Invariance to Spatial Transformations},\nauthor={Joao F. 
Henriques and Andrea Vedaldi},\nyear={2017},\nurl={https://openreview.net/forum?id=BkmM8Dceg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=BkmM8Dceg", "pdf_size": 0, "rating": "6;6;7", "confidence": "5;4;4", "rating_avg": 6.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 20, "authors#_avg": 2, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 128, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10896480336756687375&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11 }, { "id": "Bkp_y7qxe", "title": "Unsupervised Deep Learning of State Representation Using Robotic Priors", "track": "main", "status": "Reject", "tldr": "This paper introduces a method for training a deep neural network to learn a representation of a robot's environment state using a priori knowledge.", "abstract": "Our understanding of the world depends highly on how we represent it. Using background knowledge about its complex underlying physical rules, our brain can produce intuitive and simplified representations which it can easily use to solve problems. The approach of this paper aims to reproduce this simplification process using a neural network to produce a simple low dimensional state representation of the world from images acquired by a robot. As proposed in Jonschkowski & Brock (2015), we train the neural network in an unsupervised way, using the \"a priori\" knowledge we have about the world as loss functions called \"robotic priors\" that we implemented through a siamese network. This approach has been used to learn a one-dimensional representation of a Baxter head position from raw images. The experiment resulted in a 97.7% correlation between the learned representation and the ground truth, and shows that relevant visual features from the environment are learned.", "keywords": "Deep learning;Computer vision;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Timothee LESORT;David FILLIAT", "authorids": "timothee.lesort@ensta-paristech.fr;david.filliat@ensta-paristech.fr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nlesort2017unsupervised,\ntitle={Unsupervised Deep Learning of State Representation Using Robotic Priors },\nauthor={Timothee LESORT and David FILLIAT},\nyear={2017},\nurl={https://openreview.net/forum?id=Bkp_y7qxe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Bkp_y7qxe", "pdf_size": 0, "rating": "3;3;3", "confidence": "5;4;4", "rating_avg": 3.0, "confidence_avg": 4.333333333333333, "replies_avg": 10, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12791474303630022966&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "Bks8cPcxe", "title": "DeepDSL: A Compilation-based Domain-Specific Language for Deep Learning", "track": "main", "status": "Poster", "tldr": "DeepDSL (a DSL embedded in Scala) that compiles deep learning networks written in DeepDSL to Java source code, which runs on any GPU-equipped machine with competitive efficiency to existing state-of-the-art tools (e.g. Caffe and Tensorflow)", "abstract": "In recent years, Deep Learning (DL) has found great success in domains such as multimedia understanding. 
However, the complex nature of multimedia data makes it difficult to develop DL-based software. The state-of-the-art tools, such as Caffe, TensorFlow, Torch7, and CNTK, while successful in their applicable domains, are programming libraries with fixed user interface, internal representation, and execution environment. This makes it difficult to implement portable and customized DL applications.\n\nIn this paper, we present DeepDSL, a domain specific language (DSL) embedded in Scala, that compiles deep networks written in DeepDSL to Java source code. DeepDSL provides \n\n(1) intuitive constructs to support compact encoding of deep networks; \n(2) symbolic gradient derivation of the networks; \n(3) static analysis for memory consumption and error detection; and \n(4) DSL-level optimization to improve memory and runtime efficiency. \n\nDeepDSL programs are compiled into compact, efficient, customizable, and portable Java source code, which operates the CUDA and CUDNN interfaces running on NVIDIA GPU via a Java Native Interface (JNI) library. We evaluated DeepDSL with a number of popular DL networks. Our experiments show that the compiled programs have very competitive runtime performance and memory efficiency compared to the existing libraries.", "keywords": "Deep learning;Applications;Optimization", "primary_area": "", "supplementary_material": "", "author": "Tian Zhao;Xiao Bing Huang;Yu Cao", "authorids": "tzhao@uwm.edu;xiaobing@uwm.edu;ycao@cs.uml.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nzhao2017deepdsl,\ntitle={Deep{DSL}: A Compilation-based Domain-Specific Language for Deep Learning},\nauthor={Tian Zhao and Xiao Bing Huang and Yu Cao},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=Bks8cPcxe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=Bks8cPcxe", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;4;3", "rating_avg": 7.0, "confidence_avg": 3.6666666666666665, "replies_avg": 14, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17960414107165056805&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "Bkul3t9ee", "title": "Unsupervised Perceptual Rewards for Imitation Learning", "track": "main", "status": "Workshop", "tldr": "Real robots learn new tasks from observing a few human demonstrations.", "abstract": "Reward function design and exploration time are arguably the biggest obstacles to the deployment of reinforcement learning (RL) agents in the real world. In many real-world tasks, designing a suitable reward function takes considerable manual engineering and often requires additional and potentially visible sensors to be installed just to measure whether the task has been executed successfully. Furthermore, many interesting tasks consist of multiple steps that must be executed in sequence. Even when the final outcome can be measured, it does not necessarily provide useful feedback on these implicit intermediate steps or sub-goals.\nTo address these issues, we propose leveraging the abstraction power of intermediate visual representations learned by deep models to quickly infer perceptual reward functions from small numbers of demonstrations. 
We present a method that is able to identify the key intermediate steps of a task from only a handful of demonstration sequences, and automatically identify the most discriminative features for identifying these steps. This method makes use of the features in a pre-trained deep model, but does not require any explicit sub-goal supervision. The resulting reward functions, which are dense and smooth, can then be used by an RL agent to learn to perform the task in real-world settings. To evaluate the learned reward functions, we present qualitative results on two real-world tasks and a quantitative evaluation against a human-designed reward function. We also demonstrate that our method can be used to learn a complex real-world door opening skill using a real robot, even when the demonstration used for reward learning is provided by a human using their own hand. To our knowledge, these are the first results showing that complex robotic manipulation skills can be learned directly and without supervised labels from a video of a human performing the task.\n", "keywords": "Computer vision;Deep learning;Unsupervised Learning;Reinforcement Learning;Transfer Learning", "primary_area": "", "supplementary_material": "", "author": "Pierre Sermanet;Kelvin Xu;Sergey Levine", "authorids": "sermanet@google.com;kelvinxx@google.com;slevine@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nsermanet2017unsupervised,\ntitle={Unsupervised Perceptual Rewards for Imitation Learning},\nauthor={Pierre Sermanet and Kelvin Xu and Sergey Levine},\nyear={2017},\nurl={https://openreview.net/forum?id=Bkul3t9ee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer5;AnonReviewer4", "site": "https://openreview.net/forum?id=Bkul3t9ee", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;5;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 13, "authors#_avg": 3, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 197, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3067189350575490319&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12 }, { "id": "By14kuqxx", "title": "Bit-Pragmatic Deep Neural Network Computing", "track": "main", "status": "Workshop", "tldr": "A hardware accelerator for DNNs whose execution time for convolutional layers is proportional to the number of activation *bits* that are 1.", "abstract": "We quantify a source of ineffectual computations when processing the multiplications of the convolutional layers in Deep Neural Networks (DNNs) and propose Pragmatic (PRA), an architecture that exploits it to improve performance and energy efficiency. \nThe source of these ineffectual computations is best understood in the context of conventional multipliers which generate internally multiple terms, that is, products of the multiplicand and powers of two, which added together produce the final product. At runtime, many of these terms are zero as they are generated when the multiplicand is combined with the zero-bits of the multiplicator. While conventional bit-parallel multipliers calculate all terms in parallel to reduce individual product latency, PRA calculates only the non-zero terms, resulting in a design whose execution time for convolutional layers is ideally proportional to the number of activation bits that are 1. 
Measurements demonstrate that for the convolutional layers of Convolutional Neural Networks and during inference, PRA improves performance by 4.3x over the DaDianNao (DaDN) accelerator and by 4.5x when DaDN uses an 8-bit quantized representation. DaDN was reported to be 300x faster than commodity graphics processors. \n\n", "keywords": "Deep learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Jorge Albericio;Patrick Judd;Alberto Delmas;Sayeh Sharify;Andreas Moshovos", "authorids": "jorge.albericio@gmail.com;judd@ece.utoronto.ca;delmas1@ece.utoronto.ca;sayeh@ece.utoronto.ca;moshovos@ece.utoronto.ca", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nalbericio2017bitpragmatic,\ntitle={Bit-Pragmatic Deep Neural Network Computing},\nauthor={Jorge Albericio and Patrick Judd and Alberto Delmas and Sayeh Sharify and Andreas Moshovos},\nyear={2017},\nurl={https://openreview.net/forum?id=By14kuqxx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=By14kuqxx", "pdf_size": 0, "rating": "5;6;7", "confidence": "2;3;2", "rating_avg": 6.0, "confidence_avg": 2.3333333333333335, "replies_avg": 11, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 322, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12554319613499195504&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 13 }, { "id": "By1snw5gl", "title": "L-SR1: A Second Order Optimization Method for Deep Learning", "track": "main", "status": "Reject", "tldr": "We describe L-SR1, a new second order method to train deep neural networks.", "abstract": "We describe L-SR1, a new second order method to train deep neural networks. Second order methods hold great promise for distributed training of deep networks. Unfortunately, they have not proven practical. Two significant barriers to their success are inappropriate handling of saddle points, and poor conditioning of the Hessian. L-SR1 is a practical second order method that addresses these concerns. We provide experimental results showing that L-SR1 performs at least as well as Nesterov's Accelerated Gradient Descent, on the MNIST and CIFAR10 datasets. For the CIFAR10 dataset, we see competitive performance on shallow networks like LeNet5, as well as on deeper networks like residual networks. Furthermore, we perform an experimental analysis of L-SR1 with respect to its hyper-parameters to gain greater intuition. 
Finally, we outline the potential usefulness of L-SR1 in distributed training of deep neural networks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Vivek Ramamurthy;Nigel Duffy", "authorids": "vivek.ramamurthy@sentient.ai;nigel.duffy@sentient.ai", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nramamurthy2017lsr,\ntitle={L-{SR}1: A Second Order Optimization Method for Deep Learning},\nauthor={Vivek Ramamurthy and Nigel Duffy},\nyear={2017},\nurl={https://openreview.net/forum?id=By1snw5gl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=By1snw5gl", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;3;3", "rating_avg": 4.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 15, "authors#_avg": 2, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10806682869871187282&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "By5e2L9gl", "title": "Trusting SVM for Piecewise Linear CNNs", "track": "main", "status": "Poster", "tldr": "Formulating CNN layerwise optimization as an SVM problem", "abstract": "We present a novel layerwise optimization algorithm for the learning objective of Piecewise-Linear Convolutional Neural Networks (PL-CNNs), a large class of convolutional neural networks. Specifically, PL-CNNs employ piecewise linear non-linearities such as the commonly used ReLU and max-pool, and an SVM classifier as the final layer. The key observation of our approach is that the prob- lem corresponding to the parameter estimation of a layer can be formulated as a difference-of-convex (DC) program, which happens to be a latent structured SVM. We optimize the DC program using the concave-convex procedure, which requires us to iteratively solve a structured SVM problem. This allows to design an opti- mization algorithm with an optimal learning rate that does not require any tuning. Using the MNIST, CIFAR and ImageNet data sets, we show that our approach always improves over the state of the art variants of backpropagation and scales to large data and large network settings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Leonard Berrada;Andrew Zisserman;M. Pawan Kumar", "authorids": "lberrada@robots.ox.ac.uk;az@robots.ox.ac.uk;pawan@robots.ox.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nberrada2017trusting,\ntitle={Trusting {SVM} for Piecewise Linear {CNN}s},\nauthor={Leonard Berrada and Andrew Zisserman and M. 
Pawan Kumar},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=By5e2L9gl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=By5e2L9gl", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;4", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16456417541041764932&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "ByBwSPcex", "title": "Song From PI: A Musically Plausible Network for Pop Music Generation", "track": "main", "status": "Workshop", "tldr": "We present a novel hierarchical RNN for generating pop music, where the layers and the structure of the hierarchy encode our prior knowledge about how pop music is composed.", "abstract": "We present a novel framework for generating pop music. Our model is a hierarchical Recurrent Neural Network, where the layers and the structure of the hierarchy encode our prior knowledge about how pop music is composed. In particular, the bottom layers generate the melody, while the higher levels produce the drums and chords. We conduct several human studies that show a strong preference for our generated music over that produced by the recent method by Google. We additionally show two applications of our framework: neural dancing and karaoke, as well as neural story singing.", "keywords": "Applications", "primary_area": "", "supplementary_material": "", "author": "Hang Chu;Raquel Urtasun;Sanja Fidler", "authorids": "chuhang1122@cs.toronto.edu;urtasun@cs.toronto.edu;fidler@cs.toronto.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nchu2017song,\ntitle={Song From PI: A Musically Plausible Network for Pop Music Generation},\nauthor={Hang Chu and Raquel Urtasun and Sanja Fidler},\nyear={2017},\nurl={https://openreview.net/forum?id=ByBwSPcex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=ByBwSPcex", "pdf_size": 0, "rating": "4;6;7", "confidence": "3;4;3", "rating_avg": 5.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 11, "authors#_avg": 3, "corr_rating_confidence": 0.18898223650461363, "gs_citation": 179, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11992580415707728615&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "ByC7ww9le", "title": "Gaussian Attention Model and Its Application to Knowledge Base Embedding and Question Answering", "track": "main", "status": "Reject", "tldr": "We make (simple) knowledge base queries differentiable using the Gaussian attention model.", "abstract": "We propose the Gaussian attention model for content-based neural memory\naccess. With the proposed attention model, a neural network has the\nadditional degree of freedom to control the focus of its attention from\na laser sharp attention to a broad attention. It is applicable whenever\nwe can assume that the distance in the latent space reflects some notion\nof semantics. 
We use the proposed attention model as a scoring function\nfor the embedding of a knowledge base into a continuous vector space and\nthen train a model that performs question answering about the entities\nin the knowledge base. The proposed attention model can handle both the\npropagation of uncertainty when following a series of relations and also\nthe conjunction of conditions in a natural way. On a dataset of soccer\nplayers who participated in the FIFA World Cup 2014, we demonstrate that\nour model can handle both path queries and conjunctive queries well.", "keywords": "Natural language processing;Supervised Learning;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Liwen Zhang;John Winn;Ryota Tomioka", "authorids": "liwenz@cs.uchicago.edu;jwinn@microsoft.com;ryoto@microsoft.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nzhang2017gaussian,\ntitle={Gaussian Attention Model and Its Application to Knowledge Base Embedding and Question Answering},\nauthor={Liwen Zhang and John Winn and Ryota Tomioka},\nyear={2017},\nurl={https://openreview.net/forum?id=ByC7ww9le}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=ByC7ww9le", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;3;4", "rating_avg": 4.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10000893928552541863&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "ByEPMj5el", "title": "Out-of-class novelty generation: an experimental foundation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent advances in machine learning have brought the field closer to computational creativity research. From a creativity research point of view, this offers the potential to study creativity in relationship with knowledge acquisition. From a machine learning perspective, however, several aspects of creativity need to be better defined to allow the machine learning community to develop and test hypotheses in a systematic way. We propose an actionable definition of creativity as the generation of out-of-distribution novelty. We assess several metrics designed for evaluating the quality of generative models on this new task. We also propose a new experimental setup. Inspired by the usual held-out validation, we hold out entire classes for evaluating the generative potential of models. The goal of the novelty generator is then to use training classes to build a model that can generate objects from future (hold-out) classes, unknown at training time - and thus, are novel with respect to the knowledge the model incorporates. 
Through extensive experiments on various types of generative models, we are able to find architectures and hyperparameter combinations which lead to out-of-distribution novelty.\n", "keywords": "Deep learning;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Mehdi Cherti;Bal\u00e1zs K\u00e9gl;Ak\u0131n Kazak\u00e7\u0131", "authorids": "mehdicherti@gmail.com;balazskegl@gmail.com;akin.kazakci@mines-paristech.fr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ncherti2017outofclass,\ntitle={Out-of-class novelty generation: an experimental foundation},\nauthor={Mehdi Cherti and Bal{\\'a}zs K{\\'e}gl and Ak{\\i}n Kazak{\\c{c}}{\\i}},\nyear={2017},\nurl={https://openreview.net/forum?id=ByEPMj5el}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=ByEPMj5el", "pdf_size": 0, "rating": "4;5;6;7", "confidence": "4;3;3;4", "rating_avg": 5.5, "confidence_avg": 3.5, "replies_avg": 15, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8654295836607983539&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11 }, { "id": "ByG4hz5le", "title": "Adaptive Feature Abstraction for Translating Video to Language", "track": "main", "status": "Workshop", "tldr": "", "abstract": "Previous models for video captioning often use the output from a specific layer of a Convolutional Neural Network (CNN) as video representations, preventing them from modeling rich, varying context-dependent semantics in video descriptions. In this paper, we propose a new approach to generating adaptive spatiotemporal representations of videos for a captioning task. For this purpose, novel attention mechanisms with spatiotemporal alignment is employed to adaptively and sequentially focus on different layers of CNN features (levels of feature ``abstraction''), as well as local spatiotemporal regions of the feature maps at each layer. Our approach is evaluated on three benchmark datasets: YouTube2Text, M-VAD and MSR-VTT. 
Along with visualizing the results and how the model works, these experiments quantitatively demonstrate the effectiveness of the proposed adaptive spatiotemporal feature abstraction for translating videos to sentences with rich semantics.", "keywords": "Computer vision;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Yunchen Pu;Martin Renqiang Min;Zhe Gan;Lawrence Carin", "authorids": "yunchen.pu@duke.edu;renqiang@nec-labs.com;zhe.gan@duke.edu;lcarin@duke.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\npu2017adaptive,\ntitle={Adaptive Feature Abstraction for Translating Video to Language},\nauthor={Yunchen Pu and Martin Renqiang Min and Zhe Gan and Lawrence Carin},\nyear={2017},\nurl={https://openreview.net/forum?id=ByG4hz5le}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=ByG4hz5le", "pdf_size": 0, "rating": "4;4;7", "confidence": "5;4;4", "rating_avg": 5.0, "confidence_avg": 4.333333333333333, "replies_avg": 22, "authors#_avg": 4, "corr_rating_confidence": -0.5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6050672944306405622&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "ByG8A7cee", "title": "Reference-Aware Language Models", "track": "main", "status": "Reject", "tldr": "reference-aware language models", "abstract": "We propose a general class of language models that treat reference as an explicit stochastic latent variable. This architecture allows models to create mentions of entities and their attributes by accessing external databases (required by, e.g., dialogue generation and recipe generation) and internal state (required by, e.g. language models which are aware of coreference). This facilitates the incorporation of information that can be accessed in predictable locations in databases or discourse context, even when the targets of the reference may be rare words. 
Experiments on three tasks show our model variants outperform models based on deterministic attention.", "keywords": "Natural language processing;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Zichao Yang;Phil Blunsom;Chris Dyer;Wang Ling", "authorids": "zichaoy@cs.cmu.edu;pblunsom@google.com;cdyer@google.com;lingwang@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nyang2017referenceaware,\ntitle={Reference-Aware Language Models},\nauthor={Zichao Yang and Phil Blunsom and Chris Dyer and Wang Ling},\nyear={2017},\nurl={https://openreview.net/forum?id=ByG8A7cee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=ByG8A7cee", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 15, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 108, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3306236128977785938&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "ByIAPUcee", "title": "Frustratingly Short Attention Spans in Neural Language Modeling", "track": "main", "status": "Poster", "tldr": "We investigate various memory-augmented neural language models and compare them against state-of-the-art architectures.", "abstract": "Current language modeling architectures often use recurrent neural networks. Recently, various methods for incorporating differentiable memory into these architectures have been proposed. When predicting the next token, these models query information from a memory of the recent history and thus can facilitate learning mid- and long-range dependencies. However, conventional attention models produce a single output vector per time step that is used for predicting the next token as well as the key and value of a differentiable memory of the history of tokens. In this paper, we propose a key-value attention mechanism that produces separate representations for the key and value of a memory, and for a representation that encodes the next-word distribution. This usage of past memories outperforms existing memory-augmented neural language models on two corpora. Yet, we found that it mainly utilizes past memory only of the previous five representations. 
This led to the unexpected main finding that a much simpler model which simply uses a concatenation of output representations from the previous three-time steps is on par with more sophisticated memory-augmented neural language models.", "keywords": "Natural language processing;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Micha\u0142 Daniluk;Tim Rockt\u00e4schel;Johannes Welbl;Sebastian Riedel", "authorids": "michal.daniluk.15@ucl.ac.uk;t.rocktaschel@cs.ucl.ac.uk;j.welbl@cs.ucl.ac.uk;s.riedel@cs.ucl.ac.uk", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\ndaniluk2017frustratingly,\ntitle={Frustratingly Short Attention Spans in Neural Language Modeling},\nauthor={Micha{\\l} Daniluk and Tim Rockt{\\\"a}schel and Johannes Welbl and Sebastian Riedel},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=ByIAPUcee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ByIAPUcee", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;4;4", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 14, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 157, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=383651302971503829&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "ByOK0rwlx", "title": "Ternary Weight Decomposition and Binary Activation Encoding for Fast and Compact Neural Network", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper aims to reduce test-time computational load of a deep neural network. Unlike previous methods which factorize a weight matrix into multiple real-valued matrices, our method factorizes both weights and activations into integer and noninteger components. In our method, the real-valued weight matrix is approximated by a multiplication of a ternary matrix and a real-valued co-efficient matrix. Since the ternary matrix consists of three integer values, {-1, 0, +1}, it only consumes 2 bits per element. At test-time, an activation vector that passed from a previous layer is also transformed into a weighted sum of binary vectors, {-1, +1}, which enables fast feed-forward propagation based on simple logical operations: AND, XOR, and bit count. This makes it easier to deploy a deep network on low-power CPUs or to design specialized hardware.\nIn our experiments, we tested our method on three different networks: a CNN for handwritten digits, VGG-16 model for ImageNet classification, and VGG-Face for large-scale face recognition. In particular, when we applied our method to three fully connected layers in the VGG-16, 15x acceleration and memory compression up to 5.2% were achieved with only a 1.43% increase in the top-5 error. 
Our experiments also revealed that compressing convolutional layers can accelerate inference of the entire network in exchange for a slight increase in error.", "keywords": "Deep learning", "primary_area": "", "supplementary_material": "", "author": "Mitsuru Ambai;Takuya Matsumoto;Takayoshi Yamashita;Hironobu Fujiyoshi", "authorids": "manbai@d-itlab.co.jp;tmatsumoto@d-itlab.co.jp;yamashita@cs.chubu.ac.jp;hf@cs.chubu.ac.jp", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nambai2017ternary,\ntitle={Ternary Weight Decomposition and Binary Activation Encoding for Fast and Compact Neural Network},\nauthor={Mitsuru Ambai and Takuya Matsumoto and Takayoshi Yamashita and Hironobu Fujiyoshi},\nyear={2017},\nurl={https://openreview.net/forum?id=ByOK0rwlx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=ByOK0rwlx", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;3;3", "rating_avg": 5.0, "confidence_avg": 3.3333333333333335, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5149582468760559515&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 0 }, { "id": "ByOvsIqeg", "title": "Regularizing CNNs with Locally Constrained Decorrelations", "track": "main", "status": "Poster", "tldr": "We show that models regularized with local feature decorrelation have lower overfitting.", "abstract": "Regularization is key for deep learning since it allows training more complex models while keeping lower levels of overfitting. However, the most prevalent regularizations do not leverage all the capacity of the models since they rely on reducing the effective number of parameters. Feature decorrelation is an alternative for using the full capacity of the models but the overfitting reduction margins are too narrow given the overhead it introduces. In this paper, we show that regularizing negatively correlated features is an obstacle for effective decorrelation and present OrthoReg, a novel regularization technique that locally enforces feature orthogonality. As a result, imposing locality constraints in feature decorrelation removes interferences between negatively correlated feature weights, allowing the regularizer to reach higher decorrelation bounds, and reducing the overfitting more effectively. \nIn particular, we show that the models regularized with OrthoReg have higher accuracy bounds even when batch normalization and dropout are present. Moreover, since our regularization is directly performed on the weights, it is especially suitable for fully convolutional neural networks, where the weight space is constant compared to the feature map space. As a result, we are able to reduce the overfitting of state-of-the-art CNNs on CIFAR-10, CIFAR-100, and SVHN.", "keywords": "Computer vision;Deep learning;Optimization", "primary_area": "", "supplementary_material": "", "author": "Pau Rodr\u00edguez;Jordi Gonz\u00e0lez;Guillem Cucurull;Josep M. 
Gonfaus;Xavier Roca", "authorids": "pau.rodriguez@cvc.uab.es;;;;", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nrodr{\\'\\i}guez2017regularizing,\ntitle={Regularizing {CNN}s with Locally Constrained Decorrelations},\nauthor={Pau Rodr{\\'\\i}guez and Jordi Gonz{\\`a}lez and Guillem Cucurull and Josep M. Gonfaus and Xavier Roca},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=ByOvsIqeg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=ByOvsIqeg", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;4;3", "rating_avg": 7.0, "confidence_avg": 3.6666666666666665, "replies_avg": 16, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 163, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3840535160739502869&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "id": "ByQPVFull", "title": "Training Group Orthogonal Neural Networks with Privileged Information", "track": "main", "status": "Reject", "tldr": "A convolutional neural network for image classification which encourages learning more diverse feature representations by using image segmentations as privileged information.", "abstract": "Learning rich and diverse feature representation are always desired for deep convolutional neural networks (CNNs). Besides, when auxiliary annotations are available for specific data, simply ignoring them would be a great waste. In this paper, we incorporate these auxiliary annotations as privileged information and propose a novel CNN model that is able to maximize inherent diversity of a CNN model such that the model can learn better feature representation with a stronger generalization ability. More specifically, we propose a group orthogonal convolutional neural network (GoCNN) to learn features from foreground and background in an orthogonal way by exploiting privileged information for optimization, which automatically emphasizes feature diversity within a single model. 
Experiments on two benchmark datasets, ImageNet and PASCAL VOC, well demonstrate the effectiveness and high generalization ability of our proposed GoCNN models.", "keywords": "Deep learning;Computer vision;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Yunpeng Chen;Xiaojie Jin;Jiashi Feng;Shuicheng Yan", "authorids": "chenyunpeng@u.nus.edu;xiaojie.jin@u.nus.edu;elefjia@nus.edu.sg;yanshuicheng@360.cn", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nchen2017training,\ntitle={Training Group Orthogonal Neural Networks with Privileged Information},\nauthor={Yunpeng Chen and Xiaojie Jin and Jiashi Feng and Shuicheng Yan},\nyear={2017},\nurl={https://openreview.net/forum?id=ByQPVFull}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=ByQPVFull", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 13, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17342921486659618088&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "id": "ByToKu9ll", "title": "Evaluation of Defensive Methods for DNNs against Multiple Adversarial Evasion Models", "track": "main", "status": "Reject", "tldr": "robust adversarial retraining", "abstract": "Due to deep cascades of nonlinear units, deep neural networks (DNNs) can automatically learn non-local generalization priors from data and have achieved high performance in various applications.\nHowever, such properties have also opened a door for adversaries to generate the so-called adversarial examples to fool DNNs. Specifically, adversaries can inject small perturbations to the input data and therefore decrease the performance of deep neural networks significantly.\nEven worse, these adversarial examples have the transferability to attack a black-box model based on finite queries without knowledge of the target model. \nTherefore, we aim to empirically compare different defensive strategies against various adversary models and analyze the cross-model efficiency for these robust learners. 
We conclude that the adversarial retraining framework also has the transferability, which can defend adversarial examples without requiring prior knowledge of the adversary models.\nWe compare the general adversarial retraining framework with the state-of-the-art robust deep neural networks, such as distillation, autoencoder stacked with classifier (AEC), and our improved version, IAEC, to evaluate their robustness as well as the vulnerability in terms of the distortion required to mislead the learner.\nOur experimental results show that the adversarial retraining framework can defend most of the adversarial examples notably and consistently without adding additional\nvulnerabilities or performance penalty to the original model.", "keywords": "Deep learning", "primary_area": "", "supplementary_material": "", "author": "Xinyun Chen;Bo Li;Yevgeniy Vorobeychik", "authorids": "jungyhuk@gmail.com;bbbli@umich.edu;yevgeniy.vorobeychik@vanderbilt.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nchen2017evaluation,\ntitle={Evaluation of Defensive Methods for {DNN}s against Multiple Adversarial Evasion Models},\nauthor={Xinyun Chen and Bo Li and Yevgeniy Vorobeychik},\nyear={2017},\nurl={https://openreview.net/forum?id=ByToKu9ll}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=ByToKu9ll", "pdf_size": 0, "rating": "4;5;5", "confidence": "3;4;3", "rating_avg": 4.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 16, "authors#_avg": 3, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12552966878082092386&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "ByW2Avqgg", "title": "Neural Causal Regularization under the Independence of Mechanisms Assumption", "track": "main", "status": "Reject", "tldr": "We designed a neural causal regularizer to encourage predictive models to be more causal.", "abstract": "Neural networks provide a powerful framework for learning the association between input and response variables and making accurate predictions. However, in many applications such as healthcare, it is important to identify causal relationships between the inputs and the response variables to be able to change the response variables by intervention on the inputs. In pursuit of models whose predictive power comes maximally from causal variables, we propose a novel causal regularizer based on the independence of mechanisms assumption. We utilize the causal regularizer to steer deep neural network architectures towards causally-interpretable solutions. We perform a large-scale analysis of electronic health records. Employing expert's judgment as the causal ground-truth, we show that our causally-regularized algorithm outperforms its L1-regularized equivalence both in predictive performance as well as causal relevance. Finally, we show that the proposed causal regularizer can be used together with representation learning algorithms to yield up to 20% improvement in the causality score of the generated hypotheses.", "keywords": "Deep learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Mohammad Taha Bahadori;Krzysztof Chalupka;Edward Choi;Robert Chen;Walter F. 
Stewart;Jimeng Sun", "authorids": "bahadori@gatech.edu;kjchalup@caltech.edu;mp2893@gatech.edu;rchen87@gatech.edu;StewarWF@sutterhealth.org;jsun@cc.gatech.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nbahadori2017neural,\ntitle={Neural Causal Regularization under the Independence of Mechanisms Assumption},\nauthor={Mohammad Taha Bahadori and Krzysztof Chalupka and Edward Choi and Robert Chen and Walter F. Stewart and Jimeng Sun},\nyear={2017},\nurl={https://openreview.net/forum?id=ByW2Avqgg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ByW2Avqgg", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;5", "rating_avg": 5.0, "confidence_avg": 4.333333333333333, "replies_avg": 19, "authors#_avg": 6, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:FjbIdnzRJJAJ:scholar.google.com/&scioq=Neural+Causal+Regularization+under+the+Independence+of+Mechanisms+Assumption&hl=en&as_sdt=0,5", "gs_version_total": 2 }, { "id": "ByZvfijeg", "title": "Higher Order Recurrent Neural Networks", "track": "main", "status": "Reject", "tldr": "we study novel neural network structures to better model long term dependency in sequential data", "abstract": "In this paper, we study novel neural network structures to better model long term dependency in sequential data. \nWe propose to use more memory units to keep track of more preceding states in recurrent neural networks (RNNs), which are all recurrently fed to the hidden layers as feedback through different weighted paths. By extending the popular\nrecurrent structure in RNNs, we provide the models with better short-term memory mechanism to learn long term dependency in sequences. Analogous to digital filters in signal processing, we call these structures as higher order RNNs (HORNNs). Similar to RNNs, HORNNs can also be learned using the back-propagation through time method. HORNNs are generally applicable to a variety of sequence modelling tasks. In this work, we have examined HORNNs for the language modeling task using two popular data sets, namely the Penn Treebank (PTB) and English text8. Experimental results have shown that the proposed HORNNs yield the state-of-the-art performance on both data sets, significantly outperforming the regular RNNs as well as the popular LSTMs. 
", "keywords": "Deep learning;Natural language processing", "primary_area": "", "supplementary_material": "", "author": "Rohollah Soltani;Hui Jiang", "authorids": "rsoltani@cse.yorku.ca;hj@cse.yorku.ca", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nsoltani2017higher,\ntitle={Higher Order Recurrent Neural Networks},\nauthor={Rohollah Soltani and Hui Jiang},\nyear={2017},\nurl={https://openreview.net/forum?id=ByZvfijeg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=ByZvfijeg", "pdf_size": 0, "rating": "3;4;6", "confidence": "4;4;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 87, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14301410513548080158&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "BybtVK9lg", "title": "Autoencoding Variational Inference For Topic Models", "track": "main", "status": "Poster", "tldr": "We got autoencoding variational bayes to work for latent Dirichlet allocation using one weird trick. The new inference method then made it easy to make a new topic model that works even better than LDA.", "abstract": "Topic models are one of the most popular methods for learning representations of\ntext, but a major challenge is that any change to the topic model requires mathematically\nderiving a new inference algorithm. A promising approach to address\nthis problem is autoencoding variational Bayes (AEVB), but it has proven diffi-\ncult to apply to topic models in practice. We present what is to our knowledge the\nfirst effective AEVB based inference method for latent Dirichlet allocation (LDA),\nwhich we call Autoencoded Variational Inference For Topic Model (AVITM). This\nmodel tackles the problems caused for AEVB by the Dirichlet prior and by component\ncollapsing. We find that AVITM matches traditional methods in accuracy\nwith much better inference time. Indeed, because of the inference network, we\nfind that it is unnecessary to pay the computational cost of running variational\noptimization on test data. Because AVITM is black box, it is readily applied\nto new topic models. As a dramatic illustration of this, we present a new topic\nmodel called ProdLDA, that replaces the mixture model in LDA with a product\nof experts. 
By changing only one line of code from LDA, we find that ProdLDA\nyields much more interpretable topics, even if LDA is trained via collapsed Gibbs\nsampling.", "keywords": "Deep learning;Unsupervised Learning;Applications;Optimization", "primary_area": "", "supplementary_material": "", "author": "Akash Srivastava;Charles Sutton", "authorids": "akash.srivastava@ed.ac.uk;csutton@inf.ed.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nsrivastava2017autoencoding,\ntitle={Autoencoding Variational Inference For Topic Models},\nauthor={Akash Srivastava and Charles Sutton},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=BybtVK9lg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=BybtVK9lg", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;5;3", "rating_avg": 6.333333333333333, "confidence_avg": 4.0, "replies_avg": 24, "authors#_avg": 2, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 770, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6043631909895723817&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12 }, { "id": "BycCx8qex", "title": "DRAGNN: A Transition-Based Framework for Dynamically Connected Neural Networks", "track": "main", "status": "Reject", "tldr": "Modular framework for dynamically unrolled neural architectures improves structured prediction tasks", "abstract": "In this work, we present a compact, modular framework for constructing new recurrent neural architectures. Our basic module is a new generic unit, the Transition Based Recurrent Unit (TBRU). In addition to hidden layer activations, TBRUs have discrete state dynamics that allow network connections to be built dynamically as a function of intermediate activations. By connecting multiple TBRUs, we can extend and combine commonly used architectures such as sequence-to-sequence, attention mechanisms, and recursive tree-structured models. A TBRU can also serve as both an {\\em encoder} for downstream tasks and as a {\\em decoder} for its own task simultaneously, resulting in more accurate multi-task learning. We call our approach Dynamic Recurrent Acyclic Graphical Neural Networks, or DRAGNN. 
We show that DRAGNN is significantly more accurate and efficient than seq2seq with attention for syntactic dependency parsing and yields more accurate multi-task learning for extractive summarization tasks.\n", "keywords": "Natural language processing;Deep learning;Multi-modal learning;Structured prediction", "primary_area": "", "supplementary_material": "", "author": "Lingpeng Kong;Chris Alberti;Daniel Andor;Ivan Bogatyy;David Weiss", "authorids": "lingpenk@cs.cmu.edu;chrisalberti@google.com;andor@google.com;bogatyy@google.com;djweiss@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nkong2017dragnn,\ntitle={{DRAGNN}: A Transition-Based Framework for Dynamically Connected Neural Networks},\nauthor={Lingpeng Kong and Chris Alberti and Daniel Andor and Ivan Bogatyy and David Weiss},\nyear={2017},\nurl={https://openreview.net/forum?id=BycCx8qex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer6;AnonReviewer3;AnonReviewer5", "site": "https://openreview.net/forum?id=BycCx8qex", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;3;4", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5259848300388035006&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "BydARw9ex", "title": "Capacity and Trainability in Recurrent Neural Networks", "track": "main", "status": "Poster", "tldr": "", "abstract": "Two potential bottlenecks on the expressiveness of recurrent neural networks (RNNs) are their ability to store information about the task in their parameters, and to store information about the input history in their units. We show experimentally that all common RNN architectures achieve nearly the same per-task and per-unit capacity bounds with careful training, for a variety of tasks and stacking depths. They can store an amount of task information which is linear in the number of parameters, and is approximately 5 bits per parameter. They can additionally store approximately one real number from their input history per hidden unit. We further find that for several tasks it is the per-task parameter capacity bound that determines performance. These results suggest that many previous results comparing RNN architectures are driven primarily by differences in training effectiveness, rather than differences in capacity. Supporting this observation, we compare training difficulty for several architectures, and show that vanilla RNNs are far more difficult to train, yet have slightly higher capacity. 
Finally, we propose two novel RNN architectures, one of which is easier to train than the LSTM or GRU for deeply stacked architectures.", "keywords": "Deep learning", "primary_area": "", "supplementary_material": "", "author": "Jasmine Collins;Jascha Sohl-Dickstein;David Sussillo", "authorids": "jlcollins@google.com;jaschasd@google.com;sussillo@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ncollins2017capacity,\ntitle={Capacity and Trainability in Recurrent Neural Networks},\nauthor={Jasmine Collins and Jascha Sohl-Dickstein and David Sussillo},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=BydARw9ex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BydARw9ex", "pdf_size": 0, "rating": "7;8;8", "confidence": "4;4;4", "rating_avg": 7.666666666666667, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 249, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1555192084448119331&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "BydrOIcle", "title": "Unrolled Generative Adversarial Networks", "track": "main", "status": "Poster", "tldr": "We introduce a method to stabilize Generative Adversarial Networks by defining the generator objective with respect to an unrolled optimization of the discriminator. ", "abstract": "We introduce a method to stabilize Generative Adversarial Networks (GANs) by defining the generator objective with respect to an unrolled optimization of the discriminator. This allows training to be adjusted between using the optimal discriminator in the generator's objective, which is ideal but infeasible in practice, and using the current value of the discriminator, which is often unstable and leads to poor solutions. 
We show how this technique solves the common problem of mode collapse, stabilizes training of GANs with complex recurrent generators, and increases diversity and coverage of the data distribution by the generator.", "keywords": "Deep learning;Unsupervised Learning;Optimization", "primary_area": "", "supplementary_material": "", "author": "Luke Metz;Ben Poole;David Pfau;Jascha Sohl-Dickstein", "authorids": "lmetz@google.com;poole@cs.stanford.edu;pfau@google.com;jaschasd@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nmetz2017unrolled,\ntitle={Unrolled Generative Adversarial Networks},\nauthor={Luke Metz and Ben Poole and David Pfau and Jascha Sohl-Dickstein},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=BydrOIcle}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=BydrOIcle", "pdf_size": 0, "rating": "7;7;9", "confidence": "5;5;5", "rating_avg": 7.666666666666667, "confidence_avg": 5.0, "replies_avg": 19, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 4189, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13347995274615703652&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 18 }, { "id": "Bygq-H9eg", "title": "An Analysis of Deep Neural Network Models for Practical Applications", "track": "main", "status": "Reject", "tldr": "Analysis of ImageNet winning architectures in terms of accuracy, memory footprint, parameters, operations count, inference time and power consumption.", "abstract": "Since the emergence of Deep Neural Networks (DNNs) as a prominent technique in the field of computer vision, the ImageNet classification challenge has played a major role in advancing the state-of-the-art. While accuracy figures have steadily increased, the resource utilisation of winning models has not been properly taken into account. In this work, we present a comprehensive analysis of important metrics in practical applications: accuracy, memory footprint, parameters, operations count, inference time and power consumption. Key findings are: (1) power consumption is independent of batch size and architecture; (2) accuracy and inference time are in a hyperbolic relationship; (3) energy constraints are an upper bound on the maximum achievable accuracy and model complexity; (4) the number of operations is a reliable estimate of the inference time. 
We believe our analysis provides a compelling set of information that helps design and engineer efficient DNNs.", "keywords": "Computer vision;Deep learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Alfredo Canziani;Adam Paszke;Eugenio Culurciello", "authorids": "canziani@purdue.edu;a.paszke@students.mimuw.edu.pl;euge@purdue.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ncanziani2017an,\ntitle={An Analysis of Deep Neural Network Models for Practical Applications},\nauthor={Alfredo Canziani and Adam Paszke and Eugenio Culurciello},\nyear={2017},\nurl={https://openreview.net/forum?id=Bygq-H9eg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Bygq-H9eg", "pdf_size": 0, "rating": "4;4;5", "confidence": "3;3;4", "rating_avg": 4.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 17, "authors#_avg": 3, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 1789, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7120878747763061713&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "Byiy-Pqlx", "title": "Lie-Access Neural Turing Machines", "track": "main", "status": "Poster", "tldr": "We generalize Turing machines to the continuous setting using Lie group actions on manifolds.", "abstract": "\n External neural memory structures have recently become a popular tool for\n algorithmic deep learning\n (Graves et al. 2014; Weston et al. 2014). These models\n generally utilize differentiable versions of traditional discrete\n memory-access structures (random access, stacks, tapes) to provide\n the storage necessary for computational tasks. In\n this work, we argue that these neural memory systems lack specific\n structure important for relative indexing, and propose an\n alternative model, Lie-access memory, that is explicitly designed\n for the neural setting. In this paradigm, memory is accessed using\n a continuous head in a key-space manifold. The head is moved via Lie\n group actions, such as shifts or rotations, generated by a\n controller, and memory access is performed by linear smoothing in\n key space. We argue that Lie groups provide a natural generalization\n of discrete memory structures, such as Turing machines, as they\n provide inverse and identity operators while maintaining\n differentiability. To experiment with this approach, we implement\n a simplified Lie-access neural Turing machine (LANTM) with\n different Lie groups. 
We find that this approach is able to perform\n well on a range of algorithmic tasks.", "keywords": "Natural language processing;Deep learning;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Greg Yang;Alexander Rush", "authorids": "gyang@college.harvard.edu;srush@seas.harvard.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nyang2017lieaccess,\ntitle={Lie-Access Neural Turing Machines},\nauthor={Greg Yang and Alexander Rush},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=Byiy-Pqlx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer5;AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=Byiy-Pqlx", "pdf_size": 0, "rating": "6;6;7;8", "confidence": "3;4;4;4", "rating_avg": 6.75, "confidence_avg": 3.75, "replies_avg": 19, "authors#_avg": 2, "corr_rating_confidence": 0.5222329678670935, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12879493850593057636&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "Byj72udxe", "title": "Pointer Sentinel Mixture Models", "track": "main", "status": "Poster", "tldr": "Pointer sentinel mixture models provide a method to combine a traditional vocabulary softmax with a pointer network, providing state of the art results in language modeling on PTB and the newly introduced WikiText with few extra parameters.", "abstract": "Recent neural network sequence models with softmax classifiers have achieved their best language modeling performance only with very large hidden states and large vocabularies. Even then they struggle to predict rare or unseen words even if the context makes the prediction unambiguous. We introduce the pointer sentinel mixture architecture for neural sequence models which has the ability to either reproduce a word from the recent context or produce a word from a standard softmax classifier. Our pointer sentinel-LSTM model achieves state of the art language modeling performance on the Penn Treebank (70.9 perplexity) while using far fewer parameters than a standard softmax LSTM. 
In order to evaluate how well language models can exploit longer contexts and deal with more realistic vocabularies and corpora we also introduce the freely available WikiText corpus.", "keywords": "Natural language processing;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Stephen Merity;Caiming Xiong;James Bradbury;Richard Socher", "authorids": "smerity@salesforce.com;cxiong@salesforce.com;james.bradbury@salesforce.com;rsocher@salesforce.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nmerity2017pointer,\ntitle={Pointer Sentinel Mixture Models},\nauthor={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=Byj72udxe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Byj72udxe", "pdf_size": 0, "rating": "7;8;8", "confidence": "4;4;4", "rating_avg": 7.666666666666667, "confidence_avg": 4.0, "replies_avg": 11, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 2949, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17812832384777278922&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "Byk-VI9eg", "title": "Generative Multi-Adversarial Networks", "track": "main", "status": "Poster", "tldr": "GANs with multiple discriminators accelerate training to more robust performance.", "abstract": "Generative adversarial networks (GANs) are a framework for producing a generative model by way of a two-player minimax game. In this paper, we propose the \\emph{Generative Multi-Adversarial Network} (GMAN), a framework that extends GANs to multiple discriminators. In previous work, the successful training of GANs requires modifying the minimax objective to accelerate training early on. In contrast, GMAN can be reliably trained with the original, untampered objective. We explore a number of design perspectives with the discriminator role ranging from formidable adversary to forgiving teacher. 
Image generation tasks comparing the proposed framework to standard GANs demonstrate GMAN produces higher quality samples in a fraction of the iterations when measured by a pairwise GAM-type metric.", "keywords": "Deep learning;Unsupervised Learning;Games", "primary_area": "", "supplementary_material": "", "author": "Ishan Durugkar;Ian Gemp;Sridhar Mahadevan", "authorids": "idurugkar@cs.umass.edu;imgemp@cs.umass.edu;mahadeva@cs.umass.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ndurugkar2017generative,\ntitle={Generative Multi-Adversarial Networks},\nauthor={Ishan Durugkar and Ian Gemp and Sridhar Mahadevan},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=Byk-VI9eg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Byk-VI9eg", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;4;3", "rating_avg": 6.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 23, "authors#_avg": 3, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 484, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11832516614055653155&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "BylSPv9gx", "title": "Exploring Sparsity in Recurrent Neural Networks", "track": "main", "status": "Poster", "tldr": "Reduce parameter count in recurrent neural networks to create smaller models for faster deployment", "abstract": "Recurrent neural networks (RNN) are widely used to solve a variety of problems and as the quantity of data and the amount of available compute have increased, so have model sizes. The number of parameters in recent state-of-the-art networks makes them hard to deploy, especially on mobile phones and embedded devices. The challenge is due to both the size of the model and the time it takes to evaluate it. In order to deploy these RNNs efficiently, we propose a technique to reduce the parameters of a network by pruning weights during the initial training of the network. At the end of training, the parameters of the network are sparse while accuracy is still close to the original dense neural network. The network size is reduced by 8\u00d7 and the time required to train the model remains constant. Additionally, we can prune a larger dense network to achieve better than baseline performance while still reducing the total number of parameters significantly. Pruning RNNs reduces the size of the model and can also help achieve significant inference time speed-up using sparse GEMMs. 
Benchmarks show that using our technique model size can be reduced by 90% and speed-up is around 2\u00d7 to 7\u00d7.", "keywords": "Speech;Deep learning;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Sharan Narang;Greg Diamos;Shubho Sengupta;Erich Elsen", "authorids": "sharan@baidu.com;gdiamos@baidu.com;ssengupta@baidu.com;eriche@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nnarang2017exploring,\ntitle={Exploring Sparsity in Recurrent Neural Networks},\nauthor={Sharan Narang and Greg Diamos and Shubho Sengupta and Erich Elsen},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=BylSPv9gx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=BylSPv9gx", "pdf_size": 0, "rating": "6;7", "confidence": "4;3", "rating_avg": 6.5, "confidence_avg": 3.5, "replies_avg": 18, "authors#_avg": 4, "corr_rating_confidence": -0.9999999999999999, "gs_citation": 395, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15665014708787597981&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 8 }, { "id": "ByldLrqlx", "title": "DeepCoder: Learning to Write Programs", "track": "main", "status": "Poster", "tldr": "", "abstract": "We develop a first line of attack for solving programming competition-style problems from input-output examples using deep learning. The approach is to train a neural network to predict properties of the program that generated the outputs from the inputs. We use the neural network's predictions to augment search techniques from the programming languages community, including enumerative search and an SMT-based solver. Empirically, we show that our approach leads to an order of magnitude speedup over the strong non-augmented baselines and a Recurrent Neural Network approach, and that we are able to solve problems of difficulty comparable to the simplest problems on programming competition websites.", "keywords": "Deep learning;Supervised Learning;Applications;Structured prediction", "primary_area": "", "supplementary_material": "", "author": "Matej Balog;Alexander L. Gaunt;Marc Brockschmidt;Sebastian Nowozin;Daniel Tarlow", "authorids": "matej.balog@gmail.com;t-algaun@microsoft.com;mabrocks@microsoft.com;Sebastian.Nowozin@microsoft.com;dtarlow@microsoft.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nbalog2017deepcoder,\ntitle={DeepCoder: Learning to Write Programs},\nauthor={Matej Balog and Alexander L. 
Gaunt and Marc Brockschmidt and Sebastian Nowozin and Daniel Tarlow},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=ByldLrqlx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=ByldLrqlx", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;2", "rating_avg": 6.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 19, "authors#_avg": 5, "corr_rating_confidence": -1.0, "gs_citation": 752, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14663434925594619820&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "BymIbLKgl", "title": "Learning Invariant Representations Of Planar Curves", "track": "main", "status": "Poster", "tldr": "", "abstract": "We propose a metric learning framework for the construction of invariant geometric\nfunctions of planar curves for the Euclidean and Similarity group of transformations.\nWe leverage on the representational power of convolutional neural\nnetworks to compute these geometric quantities. In comparison with axiomatic\nconstructions, we show that the invariants approximated by the learning architectures\nhave better numerical qualities such as robustness to noise, resiliency to\nsampling, as well as the ability to adapt to occlusion and partiality. Finally, we develop\na novel multi-scale representation in a similarity metric learning paradigm.", "keywords": "Computer vision;Deep learning;Supervised Learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Gautam Pai;Aaron Wetzler;Ron Kimmel", "authorids": "paigautam@cs.technion.ac.il;twerd@cs.technion.ac.il;ron@cs.technion.ac.il", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\npai2017learning,\ntitle={Learning Invariant Representations Of Planar Curves },\nauthor={Gautam Pai and Aaron Wetzler and Ron Kimmel},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=BymIbLKgl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=BymIbLKgl", "pdf_size": 0, "rating": "5;6;8", "confidence": "2;5;3", "rating_avg": 6.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 16, "authors#_avg": 3, "corr_rating_confidence": 0.1428571428571429, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9026327461481252224&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "ByqiJIqxg", "title": "Online Bayesian Transfer Learning for Sequential Data Modeling", "track": "main", "status": "Poster", "tldr": "", "abstract": "We consider the problem of inferring a sequence of hidden states associated with a sequence of observations produced by an individual within a population. Instead of learning a single sequence model for the population (which does not account for variations within the population), we learn a set of basis sequence models based on different individuals. The sequence of hidden states for a new individual is inferred in an online fashion by estimating a distribution over the basis models that best explain the sequence of observations of this new individual. 
We explain how to do this in the context of hidden Markov models with Gaussian mixture models that are learned based on streaming data by online Bayesian moment matching. The resulting transfer learning technique is demonstrated with three real-world applications: activity recognition based on smartphone sensors, sleep classification based on electroencephalography data and the prediction of the direction of future packet flows between a pair of servers in telecommunication networks. ", "keywords": "Unsupervised Learning;Transfer Learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Priyank Jaini;Zhitang Chen;Pablo Carbajal;Edith Law;Laura Middleton;Kayla Regan;Mike Schaekermann;George Trimponias;James Tung;Pascal Poupart", "authorids": "pjaini@uwaterloo.ca;chenzhitang2@huawei.com;pablo@veedata.io;edith.law@uwaterloo.ca;lmiddlet@uwaterloo.ca;kregan@uwaterloo.ca;mschaekermann@uwaterloo.ca;g.trimponias@huawei.com;james.tung@uwaterloo.ca;ppoupart@uwaterloo.ca", "gender": ";;;;;;;;;", "homepage": ";;;;;;;;;", "dblp": ";;;;;;;;;", "google_scholar": ";;;;;;;;;", "orcid": ";;;;;;;;;", "linkedin": ";;;;;;;;;", "or_profile": ";;;;;;;;;", "aff": ";;;;;;;;;", "aff_domain": ";;;;;;;;;", "position": ";;;;;;;;;", "bibtex": "@inproceedings{\njaini2017online,\ntitle={Online Bayesian Transfer Learning for Sequential Data Modeling},\nauthor={Priyank Jaini and Zhitang Chen and Pablo Carbajal and Edith Law and Laura Middleton and Kayla Regan and Mike Schaekermann and George Trimponias and James Tung and Pascal Poupart},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=ByqiJIqxg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=ByqiJIqxg", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;3;3", "rating_avg": 6.333333333333333, "confidence_avg": 3.0, "replies_avg": 17, "authors#_avg": 10, "corr_rating_confidence": 0.0, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8496305941856158297&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "BysZhEqee", "title": "Marginal Deep Architectures: Deep learning for Small and Middle Scale Applications", "track": "main", "status": "Reject", "tldr": "", "abstract": "In recent years, many deep architectures have been proposed in different fields. However, to obtain good results, most of the previous deep models need a large number of training data. In this paper, for small and middle scale applications, we\npropose a novel deep learning framework based on stacked feature learning models. Particularly, we stack marginal Fisher analysis (MFA) layer by layer for the initialization of the deep architecture and call it \u201cMarginal Deep Architectures\u201d (MDA). In the implementation of MDA, the weight matrices of MFA are first learned layer by layer, and then we exploit some deep learning techniques, such as back propagation, dropout and denoising to fine tune the network. To evaluate the effectiveness of MDA, we have compared it with some feature learning methods and deep learning models on 7 small and middle scale real-world applications, including handwritten digits recognition, speech recognition, historical document understanding, image classification, action recognition and so on. 
Extensive experiments demonstrate that MDA performs not only better than shallow feature learning models, but also state-of-the-art deep learning models in these applications.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuchen Zheng;Guoqiang Zhong;Junyu Dong", "authorids": "ouczyc@outlook.com;gqzhong@ouc.edu.cn;dongjunyu@ouc.edu.cn", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nzheng2017marginal,\ntitle={Marginal Deep Architectures: Deep learning for Small and Middle Scale Applications},\nauthor={Yuchen Zheng and Guoqiang Zhong and Junyu Dong},\nyear={2017},\nurl={https://openreview.net/forum?id=BysZhEqee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BysZhEqee", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;4;4", "rating_avg": 3.6666666666666665, "confidence_avg": 4.0, "replies_avg": 13, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13391639890348458227&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "BysvGP5ee", "title": "Variational Lossy Autoencoder", "track": "main", "status": "Poster", "tldr": "A VAE that provably learns global structure of images with a local PixelCNN decoder.", "abstract": "Representation learning seeks to expose certain aspects of observed data in a learned representation that's amenable to downstream tasks like classification. \nFor instance, a good representation for 2D images might be one that describes only global structure and discards information about detailed texture. \nIn this paper, we present a simple but principled method to learn such global representations by combining Variational Autoencoder (VAE) with neural autoregressive models such as RNN, MADE and PixelRNN/CNN. \nOur proposed VAE model allows us to have control over what the global latent code can learn and , by designing the architecture accordingly, we can force the global latent code to discard irrelevant information such as texture in 2D images, and hence the code only ``autoencodes'' data in a lossy fashion.\nIn addition, by leveraging autoregressive models as both prior distribution $p(z)$ and decoding distribution $p(x|z)$, we can greatly improve generative modeling performance of VAEs, achieving new state-of-the-art results on MNIST, OMNIGLOT and Caltech-101 as well as competitive results on CIFAR10. \n", "keywords": "Deep learning;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Xi Chen;Diederik P. Kingma;Tim Salimans;Yan Duan;Prafulla Dhariwal;John Schulman;Ilya Sutskever;Pieter Abbeel", "authorids": "peter@openai.com;dpkingma@openai.com;tim@openai.com;rocky@openai.com;prafulla@mit.edu;joschu@openai.com;ilyasu@openai.com;pieter@openai.com", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@inproceedings{\nchen2017variational,\ntitle={Variational Lossy Autoencoder},\nauthor={Xi Chen and Diederik P. 
Kingma and Tim Salimans and Yan Duan and Prafulla Dhariwal and John Schulman and Ilya Sutskever and Pieter Abbeel},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=BysvGP5ee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BysvGP5ee", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;4;4", "rating_avg": 6.666666666666667, "confidence_avg": 4.0, "replies_avg": 26, "authors#_avg": 8, "corr_rating_confidence": 0.0, "gs_citation": 856, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11833073722642726902&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "ByvJuTigl", "title": "End-to-End Learnable Histogram Filters", "track": "main", "status": "Reject", "tldr": "a way to combine the algorithmic structure of Bayes filters with the end-to-end learnability of neural networks", "abstract": "Problem-specific algorithms and generic machine learning approaches have complementary strengths and weaknesses, trading-off data efficiency and generality. To find the right balance between these, we propose to use problem-specific information encoded in algorithms together with the ability to learn details about the problem-instance from data. We demonstrate this approach in the context of state estimation in robotics, where we propose end-to-end learnable histogram filters---a differentiable implementation of histogram filters that encodes the structure of recursive state estimation using prediction and measurement update but allows the specific models to be learned end-to-end, i.e. in such a way that they optimize the performance of the filter, using either supervised or unsupervised learning.", "keywords": "Deep learning;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Rico Jonschkowski;Oliver Brock", "authorids": "rico.jonschkowski@tu-berlin.de;oliver.brock@tu-berlin.de", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\njonschkowski2017endtoend,\ntitle={End-to-End Learnable Histogram Filters},\nauthor={Rico Jonschkowski and Oliver Brock},\nyear={2017},\nurl={https://openreview.net/forum?id=ByvJuTigl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=ByvJuTigl", "pdf_size": 0, "rating": "3;4;4", "confidence": "3;3;3", "rating_avg": 3.6666666666666665, "confidence_avg": 3.0, "replies_avg": 13, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7003864805962833794&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "Byx5BTilg", "title": "Exploring the Application of Deep Learning for Supervised Learning Problems", "track": "main", "status": "Reject", "tldr": "We explore the multiple DNN architectures on a large set of general supervised datasets. We also propose a meta-learning approach for DNN performance prediciton and ranking", "abstract": "One of the main difficulties in applying deep neural nets (DNNs) to new domains is the need to explore multiple architectures in order to discover ones that perform well. We analyze a large set of DNNs across multiple domains and derive insights regarding their effectiveness. 
We also analyze the characteristics of various DNNs and the general effect they may have on performance. Finally, we explore the application of meta-learning to the problem of architecture ranking. We demonstrate that by using topological features and modeling the changes in its weights, biases and activation functions layers of the initial training steps, we are able to rank architectures based on their predicted performance. We consider this work to be a first step in the important and challenging direction of exploring the space of different neural network architectures. \n", "keywords": "Deep learning;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Jose Rozanec;Gilad Katz;Eui Chul Richard Shin;Dawn Song", "authorids": "jmrozanec@gmail.com;giladk@berkeley.edu;ricshin@berkeley.edu;dawnsong@eecs.berkeley.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nrozanec2017exploring,\ntitle={Exploring the Application of Deep Learning for Supervised Learning Problems},\nauthor={Jose Rozanec and Gilad Katz and Eui Chul Richard Shin and Dawn Song},\nyear={2017},\nurl={https://openreview.net/forum?id=Byx5BTilg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Byx5BTilg", "pdf_size": 0, "rating": "3;4;5", "confidence": "5;3;4", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 4, "corr_rating_confidence": -0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:TlbqEIVvp5wJ:scholar.google.com/&scioq=Exploring+the+Application+of+Deep+Learning+for+Supervised+Learning+Problems&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "ByxpMd9lx", "title": "Transfer Learning for Sequence Tagging with Hierarchical Recurrent Networks", "track": "main", "status": "Poster", "tldr": "", "abstract": "Recent papers have shown that neural networks obtain state-of-the-art performance on several different sequence tagging tasks. One appealing property of such systems is their generality, as excellent performance can be achieved with a unified architecture and without task-specific feature engineering. However, it is unclear if such systems can be used for tasks without large amounts of training data. In this paper we explore the problem of transfer learning for neural sequence taggers, where a source task with plentiful annotations (e.g., POS tagging on Penn Treebank) is used to improve performance on a target task with fewer available annotations (e.g., POS tagging for microblogs). We examine the effects of transfer learning for deep hierarchical recurrent networks across domains, applications, and languages, and show that significant improvement can often be obtained. These improvements lead to improvements over the current state-of-the-art on several well-studied tasks.", "keywords": "Natural language processing;Deep learning;Transfer Learning", "primary_area": "", "supplementary_material": "", "author": "Zhilin Yang;Ruslan Salakhutdinov;William W. 
Cohen", "authorids": "zhiliny@cs.cmu.edu;rsalakhu@cs.cmu.edu;wcohen@cs.cmu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nyang2017transfer,\ntitle={Transfer Learning for Sequence Tagging with Hierarchical Recurrent Networks},\nauthor={Zhilin Yang and Ruslan Salakhutdinov and William W. Cohen},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=ByxpMd9lx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer1", "site": "https://openreview.net/forum?id=ByxpMd9lx", "pdf_size": 0, "rating": "5;7;8", "confidence": "4;4;4", "rating_avg": 6.666666666666667, "confidence_avg": 4.0, "replies_avg": 14, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 432, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=940864865858402058&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "H12GRgcxg", "title": "Training deep neural-networks using a noise adaptation layer", "track": "main", "status": "Poster", "tldr": "Training neural network with noisy labels", "abstract": "The availability of large datsets has enabled neural networks to achieve impressive recognition results. However, the presence of inaccurate class labels is known to deteriorate the performance of even the best classifiers in a broad range of classification problems. Noisy labels also tend to be more harmful than noisy attributes. When the observed label is noisy, we can view the correct label as a latent random variable and model the noise processes by a communication channel with unknown parameters. Thus we can apply the EM algorithm to find the parameters of both the network and the noise and to estimate the correct label. In this study we present a neural-network approach that optimizes the same likelihood function as optimized by the EM algorithm. The noise is explicitly modeled by an additional softmax layer that connects the correct labels to the noisy ones. This scheme is then extended to the case where the noisy labels are dependent on the features in addition to the correct labels. 
Experimental results demonstrate that this approach outperforms previous methods.\n", "keywords": "Deep learning;Optimization", "primary_area": "", "supplementary_material": "", "author": "Jacob Goldberger;Ehud Ben-Reuven", "authorids": "jacob.goldberger@biu.ac.il;udi.benreuven@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\ngoldberger2017training,\ntitle={Training deep neural-networks using a noise adaptation layer},\nauthor={Jacob Goldberger and Ehud Ben-Reuven},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=H12GRgcxg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=H12GRgcxg", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;5;5", "rating_avg": 5.666666666666667, "confidence_avg": 4.666666666666667, "replies_avg": 12, "authors#_avg": 2, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 847, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16339065007538213436&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "H13F3Pqll", "title": "Inverse Problems in Computer Vision using Adversarial Imagination Priors", "track": "main", "status": "Reject", "tldr": "We present a model that given a visual image learns to generate imaginations of complete scenes, albedo, shading etc, by using adversarial data driven priors on the imaginations spaces.", "abstract": "Given an image, humans effortlessly run the image formation process backwards in their minds: they can tell albedo from shading, foreground from background, and imagine the occluded parts of the scene behind foreground objects. In this work, we propose a weakly supervised inversion machine trained to generate similar imaginations that when rendered using differentiable, graphics-like decoders, produce the original visual input. We constrain the imagination spaces by providing exemplar memory repositories in the form of foreground segmented objects, albedo, shading, background scenes and imposing adversarial losses on the imagination spaces. Our model learns to perform such inversion with weak supervision, without ever having seen paired annotated data, that is, without having seen the image paired with the corresponding ground-truth imaginations. We demonstrate our method by applying it to three Computer Vision tasks: image in-painting, intrinsic decomposition and object segmentation, each task having its own differentiable renderer. 
Data driven adversarial imagination priors effectively guide inversion, minimize the need for hand designed priors of smoothness or good continuation, or the need for paired annotated data.", "keywords": "Unsupervised Learning;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Hsiao-Yu Fish Tung;Katerina Fragkiadaki", "authorids": "htung@cs.cmu.edu;katef@cs.cmu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ntung2017inverse,\ntitle={Inverse Problems in Computer Vision using Adversarial Imagination Priors},\nauthor={Hsiao-Yu Fish Tung and Katerina Fragkiadaki},\nyear={2017},\nurl={https://openreview.net/forum?id=H13F3Pqll}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=H13F3Pqll", "pdf_size": 0, "rating": "3;5;6", "confidence": "4;3;3", "rating_avg": 4.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 4, "authors#_avg": 2, "corr_rating_confidence": -0.9449111825230683, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15795058934861634500&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "H178hw9ex", "title": "Dynamic Steerable Frame Networks", "track": "main", "status": "Reject", "tldr": "Introducing non-orthogonal and overcomplete bases for ConvNets and derive Dynamic Steerable Frame Networks, a hybrid of Dynamic Filter Networks and Spatial Transformers.", "abstract": "Filters in a convolutional network are typically parametrized in a pixel basis. As an orthonormal basis, pixels may represent any arbitrary vector in Rn. In this paper, we relax this orthonormality requirement and extend the set of viable bases to the generalized notion of frames. When applying suitable frame bases to ResNets on Cifar-10+ we demonstrate improved error rates by substitution only. By exploiting the transformation properties of such generalized bases, we arrive at steerable frames, that allow to continuously transform CNN filters under arbitrary Lie-groups. Further allowing us to locally separate pose from canonical appearance. We implement this in the Dynamic Steerable Frame Network, that dynamically estimates the transformations of filters, conditioned on its input. The derived method presents a hybrid of Dynamic Filter Networks and Spatial Transformer Networks that can be implemented in any convolutional architecture, as we illustrate in two examples. First, we illustrate estimation properties of steerable frames with a Dynamic Steerable Frame Network, compared to a Dynamic Filter Network on the task of edge detection, where we show clear advantages of the derived steerable frames. Lastly, we insert the Dynamic Steerable Frame Network as a module in a convolutional LSTM on the task of limited-data hand-gesture recognition from video and illustrate effective dynamic regularization and show clear advantages over Spatial Transformer Networks. In this paper, we have laid out the foundations of Frame-based convolutional networks and Dynamic Steerable Frame Networks while illustrating their advantages for continuously transforming features and data-efficient learning.", "keywords": "Computer vision;Deep learning", "primary_area": "", "supplementary_material": "", "author": "J\u00f6rn-Henrik Jacobsen;Bert De Brabandere;Arnold W.M. 
Smeulders", "authorids": "j.jacobsen@uva.nl;bert.debrabandere@esat.kuleuven.be;a.w.m.smeulders@uva.nl", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\njacobsen2017dynamic,\ntitle={Dynamic Steerable Frame Networks},\nauthor={J{\\\"o}rn-Henrik Jacobsen and Bert De Brabandere and Arnold W.M. Smeulders},\nyear={2017},\nurl={https://openreview.net/forum?id=H178hw9ex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=H178hw9ex", "pdf_size": 0, "rating": "4;5;7", "confidence": "3;4;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 11, "authors#_avg": 3, "corr_rating_confidence": -0.18898223650461363, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:oVwfEKxYu50J:scholar.google.com/&scioq=Dynamic+Steerable+Frame+Networks&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "H1Fk2Iqex", "title": "Fast Chirplet Transform to Enhance CNN Machine Listening - Validation on Animal calls and Speech", "track": "main", "status": "Workshop", "tldr": "Proposing a chirplet transform in order to regulate the input of deep-CNN and possible extension to chirplet learning for deep learning bioacoustics", "abstract": "The scattering framework offers an optimal hierarchical convolutional decomposition according to its kernels. Convolutional Neural Net (CNN) can be seen asan optimal kernel decomposition, nevertheless it requires large amount of trainingdata to learn its kernels. We propose a trade-off between these two approaches: a Chirplet kernel as an efficient Q constant bioacoustic representation to pretrainCNN. First we motivate Chirplet bioinspired auditory representation. Second we give the first algorithm (and code) of a Fast Chirplet Transform (FCT). Third, we demonstrate the computation efficiency of FCT on large environmental data base: months of Orca recordings, and 1000 Birds species from the LifeClef challenge. Fourth, we validate FCT on the vowels subset of the Speech TIMIT dataset. The results show that FCT accelerates CNN when it pretrains low level layers: it reduces training duration by -28% for birds classification, and by -26% for vowels classification. Scores are also enhanced by FCT pretraining, with a relative gain of +7.8% of Mean Average Precision on birds, and +2.3% of vowel accuracy against raw audio CNN. 
We conclude on perspectives on tonotopic FCT deep machine listening, and inter-species bioacoustic transfer learning to generalise the representation of animal communication systems.", "keywords": "Applications;Supervised Learning;Deep learning;Speech", "primary_area": "", "supplementary_material": "", "author": "Herve Glotin;Julien Ricard;Randall Balestriero", "authorids": "glotin@univ-tln.fr;julien.ricard@gmail.com;randallbalestriero@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nglotin2017fast,\ntitle={Fast Chirplet Transform to Enhance CNN Machine Listening - Validation on Animal calls and Speech},\nauthor={Herve Glotin and Julien Ricard and Randall Balestriero},\nyear={2017},\nurl={https://openreview.net/forum?id=H1Fk2Iqex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=H1Fk2Iqex", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;5;3", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 16, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11043567411643766391&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "H1GEvHcee", "title": "Annealing Gaussian into ReLU: a New Sampling Strategy for Leaky-ReLU RBM", "track": "main", "status": "Reject", "tldr": "We study fundamental property of leaky RBM. We link the leaky RBM and truncated Gaussian distribution and propose a novel sampling algorithm without additional computation cost.", "abstract": "Restricted Boltzmann Machine (RBM) is a bipartite graphical model that is used as the building block in energy-based deep generative models. Due to numerical stability and quantifiability of the likelihood, RBM is commonly used with Bernoulli units. Here, we consider an alternative member of exponential family RBM with leaky rectified linear units -- called leaky RBM. We first study the joint and marginal distributions of leaky RBM under different leakiness, which provides us important insights by connecting the leaky RBM model and truncated Gaussian distributions. The connection leads us to a simple yet efficient method for sampling from this model, where the basic idea is to anneal the leakiness rather than the energy; -- i.e., start from a fully Gaussian/Linear unit and gradually decrease the leakiness over iterations. This serves as an alternative to the annealing of the temperature parameter and enables numerical estimation of the likelihood that are more efficient and more accurate than the commonly used annealed importance sampling (AIS). 
We further demonstrate that the proposed sampling algorithm enjoys faster mixing property than contrastive divergence algorithm, which benefits the training without any additional computational cost.", "keywords": "Deep learning;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Chun-Liang Li;Siamak Ravanbakhsh;Barnabas Poczos", "authorids": "chunlial@cs.cmu.edu;mravanba@cs.cmu.edu;bapoczos@cs.cmu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nli2017annealing,\ntitle={Annealing Gaussian into Re{LU}: a New Sampling Strategy for Leaky-Re{LU} {RBM}},\nauthor={Chun-Liang Li and Siamak Ravanbakhsh and Barnabas Poczos},\nyear={2017},\nurl={https://openreview.net/forum?id=H1GEvHcee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=H1GEvHcee", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;5;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 16, "authors#_avg": 3, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1769621892674240949&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "H1Go7Koex", "title": "Character-aware Attention Residual Network for Sentence Representation", "track": "main", "status": "Reject", "tldr": "We propose a character-aware attention residual network for short text representation.", "abstract": "Text classification in general is a well studied area. However, classifying short and noisy text remains challenging. Feature sparsity is a major issue. The quality of document representation here has a great impact on the classification accuracy. Existing methods represent text using bag-of-word model, with TFIDF or other weighting schemes. Recently word embedding and even document embedding are proposed to represent text. The purpose is to capture features at both word level and sentence level. However, the character level information are usually ignored. In this paper, we take word morphology and word semantic meaning into consideration, which are represented by character-aware embedding and word distributed embedding. By concatenating both character-level and word distributed embedding together and arranging words in order, a sentence representation matrix could be obtained. To overcome data sparsity problem of short text, sentence representation vector is then derived based on different views from sentence representation matrix. The various views contributes to the construction of an enriched sentence embedding. We employ a residual network on the sentence embedding to get a consistent and refined sentence representation. 
Evaluated on a few short text datasets, our model outperforms state-of-the-art models.", "keywords": "Deep learning", "primary_area": "", "supplementary_material": "", "author": "Xin Zheng;Zhenzhou Wu", "authorids": "xzheng008@e.ntu.edu.sg;zhenzhou.wu@sap.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nzheng2017characteraware,\ntitle={Character-aware Attention Residual Network for Sentence Representation},\nauthor={Xin Zheng and Zhenzhou Wu},\nyear={2017},\nurl={https://openreview.net/forum?id=H1Go7Koex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=H1Go7Koex", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;4;5", "rating_avg": 4.0, "confidence_avg": 4.333333333333333, "replies_avg": 11, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10409567200070031453&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "H1Gq5Q9el", "title": "Unsupervised Pretraining for Sequence to Sequence Learning", "track": "main", "status": "Reject", "tldr": "Pretraining seq2seq models gives large gains in both generalization and optimization on a variety of tasks.", "abstract": "This work presents a general unsupervised learning method to improve\nthe accuracy of sequence to sequence (seq2seq) models. In our method, the\nweights of the encoder and decoder of a seq2seq model are initialized\nwith the pretrained weights of two language models and then \nfine-tuned with labeled data. We apply this method to\nchallenging benchmarks in machine translation and abstractive\nsummarization and find that it significantly improves the subsequent\nsupervised models. Our main result is that the pretraining\naccelerates training and improves generalization of seq2seq models,\nachieving state-of-the-art results on the WMT\nEnglish->German task, surpassing a range of methods using\nboth phrase-based machine translation and neural machine\ntranslation. Our method achieves an improvement of 1.3 BLEU from the\nprevious best models on both WMT'14 and WMT'15\nEnglish->German. On summarization, our method beats\nthe supervised learning baseline.", "keywords": "Natural language processing;Deep learning;Semi-Supervised Learning;Transfer Learning", "primary_area": "", "supplementary_material": "", "author": "Prajit Ramachandran;Peter J. Liu;Quoc V. Le", "authorids": "prajitram@gmail.com;peterjliu@google.com;qvl@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nramachandran2017unsupervised,\ntitle={Unsupervised Pretraining for Sequence to Sequence Learning},\nauthor={Prajit Ramachandran and Peter J. Liu and Quoc V. 
Le},\nyear={2017},\nurl={https://openreview.net/forum?id=H1Gq5Q9el}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=H1Gq5Q9el", "pdf_size": 0, "rating": "5;6;7", "confidence": "5;4;5", "rating_avg": 6.0, "confidence_avg": 4.666666666666667, "replies_avg": 13, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 367, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6622750447258456990&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "H1Heentlx", "title": "Deep Variational Canonical Correlation Analysis", "track": "main", "status": "Reject", "tldr": "A deep generative model for multi-view representation learning", "abstract": "We present deep variational canonical correlation analysis (VCCA), a deep multi-view learning model that extends the latent variable model interpretation of linear CCA~\\citep{BachJordan05a} to nonlinear observation models parameterized by deep neural networks (DNNs). Computing the marginal data likelihood, as well as inference of the latent variables, are intractable under this model. We derive a variational lower bound of the data likelihood by parameterizing the posterior density of the latent variables with another DNN, and approximate the lower bound via Monte Carlo sampling. Interestingly, the resulting model resembles that of multi-view autoencoders~\\citep{Ngiam_11b}, with the key distinction of an additional sampling procedure at the bottleneck layer. We also propose a variant of VCCA called VCCA-private which can, in addition to the ``common variables'' underlying both views, extract the ``private variables'' within each view. We demonstrate that VCCA-private is able to disentangle the shared and private information for multi-view data without hard supervision.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Weiran Wang;Xinchen Yan;Honglak Lee;Karen Livescu", "authorids": "weiranwang@ttic.edu;xcyan@umich.edu;honglak@umich.edu;klivescu@ttic.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nwang2017deep,\ntitle={Deep Variational Canonical Correlation Analysis},\nauthor={Weiran Wang and Xinchen Yan and Honglak Lee and Karen Livescu},\nyear={2017},\nurl={https://openreview.net/forum?id=H1Heentlx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=H1Heentlx", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 13, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 188, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2809500028500303315&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "H1MjAnqxg", "title": "Intelligible Language Modeling with Input Switched Affine Networks", "track": "main", "status": "Reject", "tldr": "Input Switched Affine Networks combine intelligibility with performance for character level language modeling. ", "abstract": "The computational mechanisms by which nonlinear recurrent neural networks (RNNs) achieve their goals remains an open question. There exist many problem domains where intelligibility of the network model is crucial for deployment. 
Here we introduce a recurrent architecture composed of input-switched affine transformations, in other words an RNN without any nonlinearity and with one set of weights per input.\nWe show that this architecture achieves near identical performance to traditional architectures on language modeling of Wikipedia text, for the same number of model parameters. \nIt can obtain this performance with the potential for computational speedup compared to existing methods, by precomputing the composed affine transformations corresponding to longer input sequences. \nAs our architecture is affine, we are able to understand the mechanisms by which it functions using linear methods. For example, we show how the network linearly combines contributions from the past to make predictions at the current time step. We show how representations for words can be combined in order to understand how context is transferred across word boundaries. Finally, we demonstrate how the system can be executed and analyzed in arbitrary bases to aid understanding.", "keywords": "Natural language processing;Deep learning;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Jakob Foerster;Justin Gilmer;Jan Chorowski;Jascha Sohl-dickstein;David Sussillo", "authorids": "jakob.foerster@cs.ox.ac.uk;gilmer@google.com;jan.chorowski@cs.uni.wroc.pl;jaschasd@google.com;sussillo@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nfoerster2017intelligible,\ntitle={Intelligible Language Modeling with Input Switched Affine Networks},\nauthor={Jakob Foerster and Justin Gilmer and Jan Chorowski and Jascha Sohl-dickstein and David Sussillo},\nyear={2017},\nurl={https://openreview.net/forum?id=H1MjAnqxg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=H1MjAnqxg", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;3", "rating_avg": 6.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 5, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14992605409353645504&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "H1VyHY9gg", "title": "Data Noising as Smoothing in Neural Network Language Models", "track": "main", "status": "Poster", "tldr": "Derive data noising schemes for neural network language models corresponding to techniques in n-gram smoothing.", "abstract": "Data noising is an effective technique for regularizing neural network models. While noising is widely adopted in application domains such as vision and speech, commonly used noising primitives have not been developed for discrete sequence-level settings such as language modeling. In this paper, we derive a connection between input noising in neural network language models and smoothing in n-gram models. Using this connection, we draw upon ideas from smoothing to develop effective noising schemes. We demonstrate performance gains when applying the proposed schemes to language modeling and machine translation. Finally, we provide empirical analysis validating the relationship between noising and smoothing.", "keywords": "Natural language processing;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Ziang Xie;Sida I. 
Wang;Jiwei Li;Daniel L\u00e9vy;Aiming Nie;Dan Jurafsky;Andrew Y. Ng", "authorids": "zxie@cs.stanford.edu;sidaw@cs.stanford.edu;jiweil@stanford.edu;danilevy@cs.stanford.edu;anie@cs.stanford.edu;jurafsky@stanford.edu;ang@cs.stanford.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nxie2017data,\ntitle={Data Noising as Smoothing in Neural Network Language Models},\nauthor={Ziang Xie and Sida I. Wang and Jiwei Li and Daniel L{\\'e}vy and Aiming Nie and Dan Jurafsky and Andrew Y. Ng},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=H1VyHY9gg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=H1VyHY9gg", "pdf_size": 0, "rating": "6;6;8", "confidence": "0;4;4", "rating_avg": 6.666666666666667, "confidence_avg": 2.6666666666666665, "replies_avg": 11, "authors#_avg": 7, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 321, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4606432760581617114&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "id": "H1W1UN9gg", "title": "Deep Information Propagation", "track": "main", "status": "Poster", "tldr": "We predict whether randomly initialized neural networks can be trained by studying whether or not information can travel through them.", "abstract": "We study the behavior of untrained neural networks whose weights and biases are randomly distributed using mean field theory. We show the existence of depth scales that naturally limit the maximum depth of signal propagation through these random networks. Our main practical result is to show that random networks may be trained precisely when information can travel through them. Thus, the depth scales that we identify provide bounds on how deep a network may be trained for a specific choice of hyperparameters. As a corollary to this, we argue that in networks at the edge of chaos, one of these depth scales diverges. Thus arbitrarily deep networks may be trained only sufficiently close to criticality. We show that the presence of dropout destroys the order-to-chaos critical point and therefore strongly limits the maximum trainable depth for random networks. Finally, we develop a mean field theory for backpropagation and we show that the ordered and chaotic phases correspond to regions of vanishing and exploding gradient respectively.", "keywords": "Theory;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Samuel S. Schoenholz;Justin Gilmer;Surya Ganguli;Jascha Sohl-Dickstein", "authorids": "schsam@google.com;gilmer@google.com;sganguli@stanford.edu;jaschasd@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nschoenholz2017deep,\ntitle={Deep Information Propagation},\nauthor={Samuel S. 
Schoenholz and Justin Gilmer and Surya Ganguli and Jascha Sohl-Dickstein},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=H1W1UN9gg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=H1W1UN9gg", "pdf_size": 0, "rating": "8;8;9", "confidence": "2;3;4", "rating_avg": 8.333333333333334, "confidence_avg": 3.0, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": 0.8660254037844387, "gs_citation": 460, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3674931930385848322&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "H1_EDpogx", "title": "Near-Data Processing for Machine Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "In computer architecture, near-data processing (NDP) refers to augmenting the memory or the storage with processing power so that it can process the data stored therein. By offloading the computational burden of CPU and saving the need for transferring raw data in its entirety, NDP exhibits a great potential for acceleration and power reduction. Despite this potential, specific research activities on NDP have witnessed only limited success until recently, often owing to performance mismatches between logic and memory process technologies that put a limit on the processing capability of memory. Recently, there have been two major changes in the game, igniting the resurgence of NDP with renewed interest. The first is the success of machine learning (ML), which often demands a great deal of computation for training, requiring frequent transfers of big data. The second is the advent of NAND flash-based solid-state drives (SSDs) containing multicore processors that can accommodate extra computation for data processing. Sparked by these application needs and technological support, we evaluate the potential of NDP for ML using a new SSD platform that allows us to simulate in-storage processing (ISP) of ML workloads. Our platform (named ISP-ML) is a full-fledged simulator of a realistic multi-channel SSD that can execute various ML algorithms using the data stored in the SSD. For thorough performance analysis and in-depth comparison with alternatives, we focus on a specific algorithm: stochastic gradient descent (SGD), which is the de facto standard for training differentiable learning machines including deep neural networks. We implement and compare three variants of SGD (synchronous, Downpour, and elastic averaging) using ISP-ML, exploiting the multiple NAND channels for parallelizing SGD. In addition, we compare the performance of ISP and that of conventional in-host processing, revealing the advantages of ISP. 
Based on the advantages and limitations identified through our experiments, we further discuss directions for future research on ISP for accelerating ML.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hyeokjun Choe;Seil Lee;Hyunha Nam;Seongsik Park;Seijoon Kim;Eui-Young Chung;Sungroh Yoon", "authorids": "genesis1104@snu.ac.kr;lees231@dsl.snu.ac.kr;godqhr825@snu.ac.kr;pss015@snu.ac.kr;hokiespa@snu.ac.kr;eychung@yonsei.ac.kr;sryoon@snu.ac.kr", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nchoe2017neardata,\ntitle={Near-Data Processing for Machine Learning},\nauthor={Hyeokjun Choe and Seil Lee and Hyunha Nam and Seongsik Park and Seijoon Kim and Eui-Young Chung and Sungroh Yoon},\nyear={2017},\nurl={https://openreview.net/forum?id=H1_EDpogx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=H1_EDpogx", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;2;2", "rating_avg": 5.0, "confidence_avg": 2.6666666666666665, "replies_avg": 11, "authors#_avg": 7, "corr_rating_confidence": -0.8660254037844387, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "H1_QSDqxl", "title": "Rule Mining in Feature Space", "track": "main", "status": "Reject", "tldr": "We propose an algorithm to discover logical theories from relational embeddings of knowledge bases.", "abstract": "Relational embeddings have emerged as an excellent tool for inferring novel facts\nfrom partially observed knowledge bases. Recently, it was shown that some\nclasses of embeddings can also be exploited to perform a simplified form of rule\nmining. By interpreting logical conjunction as a form of composition between re-\nlation embeddings, simplified logical theories can be mined directly in the space\nof latent representations. In this paper, we present a method to mine full-fledged\nlogical theories, which are significantly more expressive, by casting the semantics\nof the logical operators to the space of the embeddings. In order to extract relevant\nrules in the space of relation compositions we borrow sparse reconstruction pro-\ncedures from the field of compressed sensing. 
Our empirical analysis showcases\nthe advantages of our approach.", "keywords": "Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Stefano Teso;Andrea Passerini", "authorids": "teso@disi.unitn.it;passerini@disi.unitn.it", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nteso2017rule,\ntitle={Rule Mining in Feature Space},\nauthor={Stefano Teso and Andrea Passerini},\nyear={2017},\nurl={https://openreview.net/forum?id=H1_QSDqxl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=H1_QSDqxl", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;4;4", "rating_avg": 3.6666666666666665, "confidence_avg": 4.0, "replies_avg": 6, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "H1acq85gx", "title": "Maximum Entropy Flow Networks", "track": "main", "status": "Poster", "tldr": "", "abstract": "Maximum entropy modeling is a flexible and popular framework for formulating statistical models given partial knowledge. In this paper, rather than the traditional method of optimizing over the continuous density directly, we learn a smooth and invertible transformation that maps a simple distribution to the desired maximum entropy distribution. Doing so is nontrivial in that the objective being maximized (entropy) is a function of the density itself. By exploiting recent developments in normalizing flow networks, we cast the maximum entropy problem into a finite-dimensional constrained optimization, and solve the problem by combining stochastic optimization with the augmented Lagrangian method. Simulation results demonstrate the effectiveness of our method, and applications to finance and computer vision show the flexibility and accuracy of using maximum entropy flow networks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gabriel Loaiza-Ganem *;Yuanjun Gao *;John P. Cunningham", "authorids": "gl2480@columbia.edu;yg2312@columbia.edu;jpc2181@columbia.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\n*2017maximum,\ntitle={Maximum Entropy Flow Networks},\nauthor={Gabriel Loaiza-Ganem * and Yuanjun Gao * and John P. 
Cunningham},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=H1acq85gx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=H1acq85gx", "pdf_size": 0, "rating": "6;6;9", "confidence": "4;4;5", "rating_avg": 7.0, "confidence_avg": 4.333333333333333, "replies_avg": 13, "authors#_avg": 3, "corr_rating_confidence": 1.0, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17377222519342028013&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "H1eLE8qlx", "title": "Options Discovery with Budgeted Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "The article describes a new learning model called Budgeted Option Neural Network (BONN) able to discover options based on a budgeted learning objective, and a new RL learning framework called Bi-POMDP.", "abstract": "We consider the problem of learning hierarchical policies for Reinforcement Learning able to discover options, an option corresponding to a sub-policy over a set of primitive actions. Different models have been proposed during the last decade that usually rely on a predefined set of options. We specifically address the problem of automatically discovering options in decision processes. We describe a new RL learning framework called Bi-POMDP, and a new learning model called Budgeted Option Neural Network (BONN) able to discover options based on a budgeted learning objective. Since Bi-POMDP are more general than POMDP, our model can also be used to discover options for classical RL tasks. The BONN model is evaluated on different classical RL problems, demonstrating both quantitative and qualitative interesting results.", "keywords": "Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Aurelia L\u00e9on;Ludovic Denoyer", "authorids": "aurelia.leon@lip6.fr;ludovic.denoyer@lip6.fr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nl{\\'e}on2017options,\ntitle={Options Discovery with Budgeted Reinforcement Learning},\nauthor={Aurelia L{\\'e}on and Ludovic Denoyer},\nyear={2017},\nurl={https://openreview.net/forum?id=H1eLE8qlx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=H1eLE8qlx", "pdf_size": 0, "rating": "4;4;5;5", "confidence": "5;4;4;5", "rating_avg": 4.5, "confidence_avg": 4.5, "replies_avg": 13, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11198521507532273289&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "H1fl8S9ee", "title": "Learning and Policy Search in Stochastic Dynamical Systems with Bayesian Neural Networks", "track": "main", "status": "Poster", "tldr": "", "abstract": "We present an algorithm for policy search in stochastic dynamical systems using\nmodel-based reinforcement learning. The system dynamics are described with\nBayesian neural networks (BNNs) that include stochastic input variables. These\ninput variables allow us to capture complex statistical\npatterns in the transition dynamics (e.g. multi-modality and\nheteroskedasticity), which are usually missed by alternative modeling approaches. 
After\nlearning the dynamics, our BNNs are then fed into an algorithm that performs\nrandom roll-outs and uses stochastic optimization for policy learning. We train\nour BNNs by minimizing $\\alpha$-divergences with $\\alpha = 0.5$, which usually produces better\nresults than other techniques such as variational Bayes. We illustrate the performance of our method by\nsolving a challenging problem where model-based approaches usually fail and by\nobtaining promising results in real-world scenarios including the control of a\ngas turbine and an industrial benchmark.", "keywords": "Deep learning;Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Stefan Depeweg;Jos\u00e9 Miguel Hern\u00e1ndez-Lobato;Finale Doshi-Velez;Steffen Udluft", "authorids": "stefan.depeweg@siemens.com;jmh233@cam.ac.uk;finale@seas.harvard.edu;steffen.udluft@siemens.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\ndepeweg2017learning,\ntitle={Learning and Policy Search in Stochastic Dynamical Systems with Bayesian Neural Networks},\nauthor={Stefan Depeweg and Jos{\'e} Miguel Hern{\'a}ndez-Lobato and Finale Doshi-Velez and Steffen Udluft},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=H1fl8S9ee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=H1fl8S9ee", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;3;3", "rating_avg": 6.666666666666667, "confidence_avg": 3.0, "replies_avg": 13, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 224, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14568373457880481181&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12 }, { "id": "H1hoFU9xe", "title": "Generative Adversarial Networks for Image Steganography", "track": "main", "status": "Reject", "tldr": "We consider a new type of GAN model and apply it to secure image steganography", "abstract": "Steganography is a collection of methods to hide secret information (\"payload\") within non-secret information (\"container\"). Its counterpart, Steganalysis, is the practice of determining if a message contains a hidden payload, and recovering it if possible. Presence of hidden payloads is typically detected by a binary classifier. In the present study, we propose a new model for generating image-like containers based on Deep Convolutional Generative Adversarial Networks (DCGAN). This approach allows generating more steganalysis-secure message embedding using standard steganography algorithms. 
Experiment results demonstrate that the new model successfully deceives the steganography analyzer, and for this reason, can be used in steganographic applications.", "keywords": "Computer vision;Deep learning;Unsupervised Learning;Applications;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Denis Volkhonskiy;Boris Borisenko;Evgeny Burnaev", "authorids": "dvolkhonskiy@gmail.com;bborisenko@hse.ru;e.burnaev@skoltech.ru", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nvolkhonskiy2017generative,\ntitle={Generative Adversarial Networks for Image Steganography},\nauthor={Denis Volkhonskiy and Boris Borisenko and Evgeny Burnaev},\nyear={2017},\nurl={https://openreview.net/forum?id=H1hoFU9xe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=H1hoFU9xe", "pdf_size": 0, "rating": "4;5;6", "confidence": "3;3;4", "rating_avg": 5.0, "confidence_avg": 3.3333333333333335, "replies_avg": 11, "authors#_avg": 3, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 61, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17451082029247430771&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "H1kjdOYlx", "title": "Modular Multitask Reinforcement Learning with Policy Sketches", "track": "main", "status": "Workshop", "tldr": "Learning multitask deep hierarchical policies with guidance from symbolic policy sketches", "abstract": "We describe a framework for multitask deep reinforcement learning guided by\npolicy sketches. Sketches annotate each task with a sequence of named subtasks,\nproviding high-level structural relationships among tasks, but not providing the\ndetailed guidance required by previous work on learning policy abstractions for\nRL (e.g. intermediate rewards, subtask completion signals, or intrinsic motivations).\nOur approach associates every subtask with its own modular subpolicy,\nand jointly optimizes over full task-specific policies by tying parameters across\nshared subpolicies. This optimization is accomplished via a simple decoupled\nactor\u2013critic training objective that facilitates learning common behaviors from\ndissimilar reward functions. We evaluate the effectiveness of our approach on a\nmaze navigation game and a 2-D Minecraft-inspired crafting game. Both games\nfeature extremely sparse rewards that can be obtained only after completing a\nnumber of high-level subgoals (e.g. escaping from a sequence of locked rooms or\ncollecting and combining various ingredients in the proper order). Experiments\nillustrate two main advantages of our approach. First, we outperform standard\nbaselines that learn task-specific or shared monolithic policies. 
Second, our\nmethod naturally induces\na library of primitive behaviors that can be recombined\nto rapidly acquire policies for new tasks.", "keywords": "Reinforcement Learning;Transfer Learning", "primary_area": "", "supplementary_material": "", "author": "Jacob Andreas;Dan Klein;Sergey Levine", "authorids": "jda@cs.berkeley.edu;klein@cs.berkeley.edu;svlevine@eecs.berkeley.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nandreas2017modular,\ntitle={Modular Multitask Reinforcement Learning with Policy Sketches},\nauthor={Jacob Andreas and Dan Klein and Sergey Levine},\nyear={2017},\nurl={https://openreview.net/forum?id=H1kjdOYlx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=H1kjdOYlx", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;5;5", "rating_avg": 4.0, "confidence_avg": 4.666666666666667, "replies_avg": 14, "authors#_avg": 3, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 601, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5436796240835430868&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11 }, { "id": "H1oRQDqlg", "title": "Learning to Draw Samples: With Application to Amortized MLE for Generative Adversarial Learning", "track": "main", "status": "Workshop", "tldr": "", "abstract": "We propose a simple algorithm to train stochastic neural networks to draw samples from given target distributions for probabilistic inference. Our method is based on iteratively adjusting the neural network parameters so that the output changes along a Stein variational gradient that maximally decreases the KL divergence with the target distribution. Our method works for any target distribution specified by their unnormalized density function, and can train any black-box architectures that are differentiable in terms of the parameters we want to adapt. As an application of our method, we propose an amortized MLE algorithm for training deep energy model, where a neural sampler is adaptively trained to approximate the likelihood function. 
Our method mimics an adversarial game between the deep energy model and the neural sampler, and obtains realistic-looking images competitive with the state-of-the-art results.", "keywords": "Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Dilin Wang;Qiang Liu", "authorids": "dilin.wang.gr@dartmouth.edu;qiang.liu@dartmouth.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nwang2017learning,\ntitle={Learning to Draw Samples: With Application to Amortized {MLE} for Generative Adversarial Learning},\nauthor={Dilin Wang and Qiang Liu},\nyear={2017},\nurl={https://openreview.net/forum?id=H1oRQDqlg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=H1oRQDqlg", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;3;3", "rating_avg": 4.0, "confidence_avg": 3.3333333333333335, "replies_avg": 9, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 209, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11221442924595490338&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11 }, { "id": "H1oyRlYgg", "title": "On Large-Batch Training for Deep Learning: Generalization Gap and Sharp Minima", "track": "main", "status": "Oral", "tldr": "We present numerical evidence for the argument that if deep networks are trained using large (mini-)batches, they converge to sharp minimizers, and these minimizers have poor generalization properties. ", "abstract": "The stochastic gradient descent (SGD) method and its variants are algorithms of choice for many Deep Learning tasks. These methods operate in a small-batch regime wherein a fraction of the training data, say $32$--$512$ data points, is sampled to compute an approximation to the gradient. It has been observed in practice that when using a larger batch there is a degradation in the quality of the model, as measured by its ability to generalize. We investigate the cause for this generalization drop in the large-batch regime and present numerical evidence that supports the view that large-batch methods tend to converge to sharp minimizers of the training and testing functions---and as is well known, sharp minima lead to poorer generalization. In contrast, small-batch methods consistently converge to flat minimizers, and our experiments support a commonly held view that this is due to the inherent noise in the gradient estimation. 
We discuss several strategies to attempt to help large-batch methods eliminate this generalization gap.", "keywords": "Deep learning;Optimization", "primary_area": "", "supplementary_material": "", "author": "Nitish Shirish Keskar;Dheevatsa Mudigere;Jorge Nocedal;Mikhail Smelyanskiy;Ping Tak Peter Tang", "authorids": "keskar.nitish@u.northwestern.edu;dheevatsa.mudigere@intel.com;j-nocedal@northwestern.edu;mikhail.smelyanskiy@intel.com;peter.tang@intel.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nkeskar2017on,\ntitle={On Large-Batch Training for Deep Learning: Generalization Gap and Sharp Minima},\nauthor={Nitish Shirish Keskar and Dheevatsa Mudigere and Jorge Nocedal and Mikhail Smelyanskiy and Ping Tak Peter Tang},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=H1oyRlYgg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=H1oyRlYgg", "pdf_size": 0, "rating": "6;8;10", "confidence": "4;3;3", "rating_avg": 8.0, "confidence_avg": 3.3333333333333335, "replies_avg": 15, "authors#_avg": 5, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 4006, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2526562489715623205&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11 }, { "id": "H1wgawqxl", "title": "Nonparametrically Learning Activation Functions in Deep Neural Nets", "track": "main", "status": "Workshop", "tldr": "A new class of nonparametric activation functions for deep learning with theoretical guarantees for generalization error.", "abstract": "We provide a principled framework for nonparametrically learning activation functions in deep neural networks. Currently, state-of-the-art deep networks treat choice of activation function as a hyper-parameter before training. By allowing activation functions to be estimated as part of the training procedure, we expand the class of functions that each node in the network can learn. We also provide a theoretical justification for our choice of nonparametric activation functions and demonstrate that networks with our nonparametric activation functions generalize well. 
To demonstrate the power of our novel techniques, we test them on image recognition datasets and achieve up to a 15% relative increase in test performance compared to the baseline.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Carson Eisenach;Zhaoran Wang;Han Liu", "authorids": "eisenach@princeton.edu;zhaoran@princeton.edu;hanliu@princeton.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\neisenach2017nonparametrically,\ntitle={Nonparametrically Learning Activation Functions in Deep Neural Nets},\nauthor={Carson Eisenach and Zhaoran Wang and Han Liu},\nyear={2017},\nurl={https://openreview.net/forum?id=H1wgawqxl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=H1wgawqxl", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;4;5", "rating_avg": 6.0, "confidence_avg": 4.333333333333333, "replies_avg": 11, "authors#_avg": 3, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12488821009628208897&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "H1zJ-v5xl", "title": "Quasi-Recurrent Neural Networks", "track": "main", "status": "Poster", "tldr": "QRNNs, composed of convolutions and a recurrent pooling function, outperform LSTMs on a variety of sequence tasks and are up to 16 times faster.", "abstract": "Recurrent neural networks are a powerful tool for modeling sequential data, but the dependence of each timestep\u2019s computation on the previous timestep\u2019s output limits parallelism and makes RNNs unwieldy for very long sequences. We introduce quasi-recurrent neural networks (QRNNs), an approach to neural sequence modeling that alternates convolutional layers, which apply in parallel across timesteps, and a minimalist recurrent pooling function that applies in parallel across channels. Despite lacking trainable recurrent layers, stacked QRNNs have better predictive accuracy than stacked LSTMs of the same hidden size. Due to their increased parallelism, they are up to 16 times faster at train and test time. 
Experiments on language modeling, sentiment classification, and character-level neural machine translation demonstrate these advantages and underline the viability of QRNNs as a basic building block for a variety of sequence tasks.", "keywords": "Natural language processing;Deep learning", "primary_area": "", "supplementary_material": "", "author": "James Bradbury;Stephen Merity;Caiming Xiong;Richard Socher", "authorids": "james.bradbury@salesforce.com;smerity@salesforce.com;cxiong@salesforce.com;rsocher@salesforce.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nbradbury2017quasirecurrent,\ntitle={Quasi-Recurrent Neural Networks},\nauthor={James Bradbury and Stephen Merity and Caiming Xiong and Richard Socher},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=H1zJ-v5xl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=H1zJ-v5xl", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;4;4", "rating_avg": 6.666666666666667, "confidence_avg": 4.0, "replies_avg": 16, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 669, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4062513269935809949&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "HJ0NvFzxl", "title": "Learning Graphical State Transitions", "track": "main", "status": "Oral", "tldr": "I introduce a set of differentiable graph transformations, and use them to build a model with a graphical internal state that can extract structured data from text and use it to answer queries.", "abstract": "Graph-structured data is important in modeling relationships between multiple entities, and can be used to represent states of the world as well as many data structures. Li et al. (2016) describe a model known as a Gated Graph Sequence Neural Network (GGS-NN) that produces sequences from graph-structured input. In this work I introduce the Gated Graph Transformer Neural Network (GGT-NN), an extension of GGS-NNs that uses graph-structured data as an intermediate representation. The model can learn to construct and modify graphs in sophisticated ways based on textual input, and also to use the graphs to produce a variety of outputs. For example, the model successfully learns to solve almost all of the bAbI tasks (Weston et al., 2016), and also discovers the rules governing graphical formulations of a simple cellular automaton and a family of Turing machines.", "keywords": "Natural language processing;Deep learning;Supervised Learning;Structured prediction", "primary_area": "", "supplementary_material": "", "author": "Daniel D. Johnson", "authorids": "ddjohnson@hmc.edu", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@inproceedings{\njohnson2017learning,\ntitle={Learning Graphical State Transitions},\nauthor={Daniel D. 
Johnson},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=HJ0NvFzxl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HJ0NvFzxl", "pdf_size": 0, "rating": "7;9;9", "confidence": "2;3;3", "rating_avg": 8.333333333333334, "confidence_avg": 2.6666666666666665, "replies_avg": 19, "authors#_avg": 1, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 131, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4537707114593494686&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "HJ0UKP9ge", "title": "Bidirectional Attention Flow for Machine Comprehension", "track": "main", "status": "Poster", "tldr": "", "abstract": "Machine comprehension (MC), answering a query about a given context paragraph, requires modeling complex interactions between the context and the query. Recently, attention mechanisms have been successfully extended to MC. Typically these methods use attention to focus on a small portion of the context and summarize it with a fixed-size vector, couple attentions temporally, and/or often form a uni-directional attention. In this paper we introduce the Bi-Directional Attention Flow (BIDAF) network, a multi-stage hierarchical process that represents the context at different levels of granularity and uses bi-directional attention flow mechanism to obtain a query-aware context representation without early summarization. Our experimental evaluations show that our model achieves the state-of-the-art results in Stanford Question Answering Dataset (SQuAD) and CNN/DailyMail cloze test.", "keywords": "Natural language processing;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Minjoon Seo;Aniruddha Kembhavi;Ali Farhadi;Hannaneh Hajishirzi", "authorids": "minjoon@cs.washington.edu;anik@allenai.org;alif@allenai.org;hannaneh@cs.washington.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nseo2017bidirectional,\ntitle={Bidirectional Attention Flow for Machine Comprehension},\nauthor={Minjoon Seo and Aniruddha Kembhavi and Ali Farhadi and Hannaneh Hajishirzi},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=HJ0UKP9ge}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HJ0UKP9ge", "pdf_size": 0, "rating": "7;8;8", "confidence": "5;4;5", "rating_avg": 7.666666666666667, "confidence_avg": 4.666666666666667, "replies_avg": 27, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 2454, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=199178676793208244&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "HJ1JBJ5gl", "title": "Representing inferential uncertainty in deep neural networks through sampling", "track": "main", "status": "Reject", "tldr": "Dropout- and dropconnect-based Bayesian deep neural networks with sampling at inference better represent their own inferential uncertainty than traditional deep neural networks.", "abstract": "As deep neural networks (DNNs) are applied to increasingly challenging problems, they will need to be able to represent their own uncertainty. 
Modelling uncertainty is one of the key features of Bayesian methods. Bayesian DNNs that use dropout-based variational distributions and scale to complex tasks have recently been proposed. We evaluate Bayesian DNNs trained with Bernoulli or Gaussian multiplicative masking of either the units (dropout) or the weights (dropconnect). We compare these Bayesian DNNs ability to represent their uncertainty about their outputs through sampling during inference. We tested the calibration of these Bayesian fully connected and convolutional DNNs on two visual inference tasks (MNIST and CIFAR-10). By adding different levels of Gaussian noise to the test images, we assessed how these DNNs represented their uncertainty about regions of input space not covered by the training set. These Bayesian DNNs represented their own uncertainty more accurately than traditional DNNs with a softmax output. We find that sampling of weights, whether Gaussian or Bernoulli, led to more accurate representation of uncertainty compared to sampling of units. However, sampling units using either Gaussian or Bernoulli dropout led to increased convolutional neural network (CNN) classification accuracy. Based on these findings we use both Bernoulli dropout and Gaussian dropconnect concurrently, which approximates the use of a spike-and-slab variational distribution. We find that networks with spike-and-slab sampling combine the advantages of the other methods: they classify with high accuracy and robustly represent the uncertainty of their classifications for all tested architectures.", "keywords": "Deep learning;Theory;Applications", "primary_area": "", "supplementary_material": "", "author": "Patrick McClure;Nikolaus Kriegeskorte", "authorids": "Patrick.McClure@mrc-cbu.cam.ac.uk;Nikolaus.Kriegeskorte@mrc-cbu.cam.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nmcclure2017representing,\ntitle={Representing inferential uncertainty in deep neural networks through sampling},\nauthor={Patrick McClure and Nikolaus Kriegeskorte},\nyear={2017},\nurl={https://openreview.net/forum?id=HJ1JBJ5gl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HJ1JBJ5gl", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.0, "replies_avg": 11, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16505316643195642572&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "HJ1kmv9xx", "title": "LR-GAN: Layered Recursive Generative Adversarial Networks for Image Generation", "track": "main", "status": "Poster", "tldr": "A layered recursive GAN for image generation, which considers the structure in images and can disentangle the foreground objects from background well in unsupervised manner.", "abstract": "We present LR-GAN: an adversarial image generation model which takes scene structure and context into account. Unlike previous generative adversarial networks (GANs), the proposed GAN learns to generate image background and foregrounds separately and recursively, and stitch the foregrounds on the background in a contextually relevant manner to produce a complete natural image. For each foreground, the model learns to generate its appearance, shape and pose. 
The whole model is unsupervised, and is trained in an end-to-end manner with conventional gradient descent methods. The experiments demonstrate that LR-GAN can generate more natural images with objects that are more human recognizable than baseline GANs.", "keywords": "Computer vision;Deep learning;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Jianwei Yang;Anitha Kannan;Dhruv Batra;Devi Parikh", "authorids": "jw2yang@vt.edu;akannan@fb.com;dbatra@gatech.edu;parikh@gatech.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nyang2017lrgan,\ntitle={{LR}-{GAN}: Layered Recursive Generative Adversarial Networks for Image Generation},\nauthor={Jianwei Yang and Anitha Kannan and Dhruv Batra and Devi Parikh},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=HJ1kmv9xx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HJ1kmv9xx", "pdf_size": 0, "rating": "6;6;7", "confidence": "0;3;4", "rating_avg": 6.333333333333333, "confidence_avg": 2.3333333333333335, "replies_avg": 16, "authors#_avg": 4, "corr_rating_confidence": 0.6933752452815363, "gs_citation": 297, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6637888010374914762&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "HJ5PIaseg", "title": "Towards an automatic Turing test: Learning to evaluate dialogue responses", "track": "main", "status": "Workshop", "tldr": "We propose a model for evaluating dialogue responses that correlates significantly with human judgement at the utterance-level and system-level.", "abstract": "Automatically evaluating the quality of dialogue responses for unstructured domains is a challenging problem.\nUnfortunately, existing automatic evaluation metrics are biased and correlate very poorly with human judgements of response quality (Liu et al., 2016). Yet having an accurate automatic evaluation procedure is crucial for dialogue research, as it allows rapid prototyping and testing of new models with fewer expensive human evaluations. In response to this challenge, we formulate automatic dialogue evaluation as a learning problem. We present an evaluation model (ADEM) that learns to predict human-like scores to input responses, using a new dataset of human response scores. We show that the ADEM model's predictions correlate significantly, and at level much higher than word-overlap metrics such as BLEU, with human judgements at both the utterance and system-level. We also show that ADEM can generalize to evaluating dialogue models unseen during training, an important step for automatic dialogue evaluation.", "keywords": "Natural language processing;Applications", "primary_area": "", "supplementary_material": "", "author": "Ryan Lowe;Michael Noseworthy;Iulian V. 
Serban;Nicolas Angelard-Gontier;Yoshua Bengio;Joelle Pineau", "authorids": "rlowe1@cs.mcgill.ca;michael.noseworthy@mail.mcgill.ca;julianserban@gmail.com;nicolas.angelard-gontier@mail.mcgill.ca;yoshua.umontreal@gmail.com;jpineau@cs.mcgill.ca", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nlowe2017towards,\ntitle={Towards an automatic Turing test: Learning to evaluate dialogue responses},\nauthor={Ryan Lowe and Michael Noseworthy and Iulian V. Serban and Nicolas Angelard-Gontier and Yoshua Bengio and Joelle Pineau},\nyear={2017},\nurl={https://openreview.net/forum?id=HJ5PIaseg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=HJ5PIaseg", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 17, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 453, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15470106720904286429&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "HJ6idTdgg", "title": "Pedestrian Detection Based On Fast R-CNN and Batch Normalization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Most of the pedestrian detection methods are based on hand-crafted features which produce low accuracy on complex scenes. With the development of deep learning method, pedestrian detection has achieved great success. In this paper, we take advantage of a convolutional neural network which is based on Fast R-CNN framework to extract robust pedestrian features for efficient and effective pedestrian detection in complicated environments. We use the EdgeBoxes algorithm to generate effective region proposals from an image, as the quality of extracted region proposals can greatly affect the detection performance. In order to reduce the training time and to improve the generalization performance, we add a batch normalization layer between the convolutional layer and the activation function layer. 
Experiments show that the proposed method achieves satisfactory performance on the INRIA and ETH datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhong-Qiu Zhao;Haiman Bian;Donghui Hu;Herve Glotin", "authorids": "z.zhao@hfut.edu.cn;bhm2164@163.com;hudh@hfut.edu.cn;h.glotin@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nzhao2017pedestrian,\ntitle={Pedestrian Detection Based On Fast R-{CNN} and Batch Normalization },\nauthor={Zhong-Qiu Zhao and Haiman Bian and Donghui Hu and Herve Glotin},\nyear={2017},\nurl={https://openreview.net/forum?id=HJ6idTdgg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HJ6idTdgg", "pdf_size": 0, "rating": "2;3;3;3", "confidence": "5;5;5;5", "rating_avg": 2.75, "confidence_avg": 5.0, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 66, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4068762802519021860&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "HJ7O61Yxe", "title": "Modelling Relational Time Series using Gaussian Embeddings", "track": "main", "status": "Reject", "tldr": "We learn latent gaussian distributions for modelling correlated series.", "abstract": "We address the problem of modeling multiple simultaneous time series where the observations are correlated not only inside each series, but among the different series. This problem happens in many domains such as ecology, meteorology, etc. We propose a new dynamical state space model, based on representation learning, for modeling the evolution of such series. The joint relational and temporal dynamics of the series are modeled as Gaussian distributions in a latent space. A decoder maps the latent representations to the observations. The two components (dynamic model and decoder) are jointly trained. 
Using stochastic representations allows us to model the uncertainty inherent to observations and to predict unobserved values together with a confidence in the prediction.", "keywords": "Applications;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Ludovic Dos Santos;Ali Ziat;Ludovic Denoyer;Benjamin Piwowarski;Patrick Gallinari", "authorids": "ludovic.dossantos@lip6.fr;ali.ziat@vedecom.fr;ludovic.denoyer@lip6.fr;benjamin.piwowarski@lip6.fr;patrick.gallinari@lip6.fr", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nsantos2017modelling,\ntitle={Modelling Relational Time Series using Gaussian Embeddings},\nauthor={Ludovic Dos Santos and Ali Ziat and Ludovic Denoyer and Benjamin Piwowarski and Patrick Gallinari},\nyear={2017},\nurl={https://openreview.net/forum?id=HJ7O61Yxe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HJ7O61Yxe", "pdf_size": 0, "rating": "4;4;4", "confidence": "5;3;4", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13663295211254398570&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "HJ9rLLcxg", "title": "Dataset Augmentation in Feature Space", "track": "main", "status": "Workshop", "tldr": "We argue for domain-agnostic data augmentation in feature space by applying simple transformations to seq2seq context vectors.", "abstract": "Dataset augmentation, the practice of applying a wide array of domain-specific transformations to synthetically expand a training set, is a standard tool in supervised learning. While effective in tasks such as visual recognition, the set of transformations must be carefully designed, implemented, and tested for every new domain, limiting its re-use and generality. In this paper, we adopt a simpler, domain-agnostic approach to dataset augmentation. We start with existing data points and apply simple transformations such as adding noise, interpolating, or extrapolating between them. Our main insight is to perform the transformation not in input space, but in a learned feature space. A re-kindling of interest in unsupervised representation learning makes this technique timely and more effective. It is a simple proposal, but to-date one that has not been tested empirically. Working in the space of context vectors generated by sequence-to-sequence models, we demonstrate a technique that is effective for both static and sequential data.\n", "keywords": "Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Terrance DeVries;Graham W. 
Taylor", "authorids": "terrance@uoguelph.ca;gwtaylor@uoguelph.ca", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nlee2017making,\ntitle={Making Stochastic Neural Networks from Deterministic Ones},\nauthor={Kimin Lee and Jaehyung Kim and Song Chong and Jinwoo Shin},\nyear={2017},\nurl={https://openreview.net/forum?id=B1akgy9xx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HJ9rLLcxg", "pdf_size": 0, "rating": "4;6;7", "confidence": "5;5;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.666666666666667, "replies_avg": 13, "authors#_avg": 2, "corr_rating_confidence": -0.7559289460184544, "gs_citation": 583, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4882158153802743824&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "HJDBUF5le", "title": "Towards a Neural Statistician", "track": "main", "status": "Poster", "tldr": "Learning representations of datasets with an extension of VAEs.", "abstract": "An efficient learner is one who reuses what they already know to tackle a new problem. For a machine learner, this means understanding the similarities amongst datasets. In order to do this, one must take seriously the idea of working with datasets, rather than datapoints, as the key objects to model. Towards this goal, we demonstrate an extension of a variational autoencoder that can learn a method for computing representations, or statistics, of datasets in an unsupervised fashion. The network is trained to produce statistics that encapsulate a generative model for each dataset. Hence the network enables efficient learning from new datasets for both unsupervised and supervised tasks. We show that we are able to learn statistics that can be used for: clustering datasets, transferring generative models to new datasets, selecting representative samples of datasets and classifying previously unseen classes. 
We refer to our model as a neural statistician, and by this we mean a neural network that can learn to compute summary statistics of datasets without supervision.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Harrison Edwards;Amos Storkey", "authorids": "h.l.edwards@sms.ed.ac.uk;amos.storkey@ed.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nedwards2017towards,\ntitle={Towards a Neural Statistician},\nauthor={Harrison Edwards and Amos Storkey},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=HJDBUF5le}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=HJDBUF5le", "pdf_size": 0, "rating": "6;8;8", "confidence": "4;4;2", "rating_avg": 7.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 12, "authors#_avg": 2, "corr_rating_confidence": -0.5, "gs_citation": 552, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13612845677780497104&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "HJDdiT9gl", "title": "Generating Long and Diverse Responses with Neural Conversation Models", "track": "main", "status": "Reject", "tldr": "We generate high quality and informative open-domain conversation responses using seq2seq model with target-side attention and stochastic beam search with segment-by-segment reranking. ", "abstract": "Building general-purpose conversation agents is a very challenging task, but necessary on the road toward intelligent agents that can interact with humans in natural language. Neural conversation models -- purely data-driven systems trained end-to-end on dialogue corpora -- have shown great promise recently, yet they often produce short and generic responses. This work presents new training and decoding methods that improve the quality, coherence, and diversity of long responses generated using sequence-to-sequence models. Our approach adds self-attention to the decoder to maintain coherence in longer responses, and we propose a practical approach, called the glimpse-model, for scaling to large datasets. We introduce a stochastic beam-search algorithm with segment-by-segment reranking which lets us inject diversity earlier in the generation process. We trained on a combined data set of over 2.3B conversation messages mined from the web. In human evaluation studies, our method produces longer responses overall, with a higher proportion rated as acceptable and excellent as length increases, compared to baseline sequence-to-sequence models with explicit length-promotion. 
A back-off strategy produces better responses overall, in the full spectrum of lengths.", "keywords": "Natural language processing;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Louis Shao;Stephan Gouws;Denny Britz;Anna Goldie;Brian Strope;Ray Kurzweil", "authorids": "overmind@google.com;sgouws@google.com;dennybritz@google.com;agoldie@google.com;bps@google.com;raykurzweil@google.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nshao2017generating,\ntitle={Generating Long and Diverse Responses with Neural Conversation Models},\nauthor={Louis Shao and Stephan Gouws and Denny Britz and Anna Goldie and Brian Strope and Ray Kurzweil},\nyear={2017},\nurl={https://openreview.net/forum?id=HJDdiT9gl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HJDdiT9gl", "pdf_size": 0, "rating": "5;7;7", "confidence": "3;3;4", "rating_avg": 6.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 16, "authors#_avg": 6, "corr_rating_confidence": 0.49999999999999983, "gs_citation": 76, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7880653531604227149&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "HJF3iD9xe", "title": "Deep Learning with Sets and Point Clouds", "track": "main", "status": "Workshop", "tldr": "Parameter-sharing for permutation-equivariance and invariance with applications to point-cloud classification.", "abstract": "We introduce a simple permutation equivariant layer for deep learning with set structure. This type of layer, obtained by parameter-sharing, has a simple implementation and linear-time complexity in the size of each set. We use deep permutation-invariant networks to perform point-cloud classification and MNIST digit summation, where in both cases the output is invariant to permutations of the input. 
In a semi-supervised setting, where the goal is to make predictions for each instance within a set, we demonstrate the usefulness of this type of layer in set-outlier detection as well as semi-supervised learning with clustering side-information.", "keywords": "Deep learning;Structured prediction;Computer vision;Supervised Learning;Semi-Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Siamak Ravanbakhsh;Jeff Schneider;Barnabas Poczos", "authorids": "mravanba@cs.cmu.edu;bapoczos@cs.cmu.edu;jeff.schneider@cs.cmu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nravanbakhsh2017deep,\ntitle={Deep Learning with Sets and Point Clouds},\nauthor={Siamak Ravanbakhsh and Jeff Schneider and Barnabas Poczos},\nyear={2017},\nurl={https://openreview.net/forum?id=HJF3iD9xe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HJF3iD9xe", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;4;1", "rating_avg": 5.666666666666667, "confidence_avg": 3.0, "replies_avg": 19, "authors#_avg": 3, "corr_rating_confidence": -0.9999999999999999, "gs_citation": 224, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8683960035333446376&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "HJGODLqgx", "title": "Recurrent Hidden Semi-Markov Model", "track": "main", "status": "Poster", "tldr": "We propose to incorporate the RNN to model the generative process in Hidden Semi-Markov Model for unsupervised segmentation and labeling.", "abstract": "Segmentation and labeling of high dimensional time series data has wide applications in behavior understanding and medical diagnosis. Due to the difficulty in obtaining the label information for high dimensional data, realizing this objective in an unsupervised way is highly desirable. Hidden Semi-Markov Model (HSMM) is a classical tool for this problem. However, existing HSMM and its variants have simple conditional assumptions of observations, thus the ability to capture the nonlinear and complex dynamics within segments is limited. To tackle this limitation, we propose to incorporate the Recurrent Neural Network (RNN) to model the generative process in HSMM, resulting in the Recurrent HSMM (R-HSMM). To accelerate the inference while preserving accuracy, we designed a structure encoding function to mimic the exact inference. By generalizing the penalty method to distribution space, we are able to train the model and the encoding function simultaneously. Empirical results show that the proposed R-HSMM achieves the state-of-the-art performances on both synthetic and real-world datasets. 
", "keywords": "Deep learning;Unsupervised Learning;Structured prediction", "primary_area": "", "supplementary_material": "", "author": "Hanjun Dai;Bo Dai;Yan-Ming Zhang;Shuang Li;Le Song", "authorids": "hanjundai@gatech.edu;bodai@gatech.edu;ymzhang@nlpr.ia.ac.cn;sli370@gatech.edu;lsong@cc.gatech.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\ndai2017recurrent,\ntitle={Recurrent Hidden Semi-Markov Model},\nauthor={Hanjun Dai and Bo Dai and Yan-Ming Zhang and Shuang Li and Le Song},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=HJGODLqgx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HJGODLqgx", "pdf_size": 0, "rating": "7;7;7", "confidence": "3;4;4", "rating_avg": 7.0, "confidence_avg": 3.6666666666666665, "replies_avg": 11, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15551244728618477829&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "HJGwcKclx", "title": "Soft Weight-Sharing for Neural Network Compression", "track": "main", "status": "Poster", "tldr": "We use soft weight-sharing to compress neural network weights.", "abstract": "The success of deep learning in numerous application domains created the desire to run and train them on mobile devices. This however, conflicts with their computationally, memory and energy intense nature, leading to a growing interest in compression.\nRecent work by Han et al. (2016) propose a pipeline that involves retraining, pruning and quantization of neural network weights, obtaining state-of-the-art compression rates.\nIn this paper, we show that competitive compression rates can be achieved by using a version of \"soft weight-sharing\" (Nowlan & Hinton, 1991). Our method achieves both quantization and pruning in one simple (re-)training procedure. \nThis point of view also exposes the relation between compression and the minimum description length (MDL) principle. 
", "keywords": "Deep learning;Optimization", "primary_area": "", "supplementary_material": "", "author": "Karen Ullrich;Edward Meeds;Max Welling", "authorids": "karen.ullrich@uva.nl;tmeeds@gmail.com;welling.max@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nullrich2017soft,\ntitle={Soft Weight-Sharing for Neural Network Compression},\nauthor={Karen Ullrich and Edward Meeds and Max Welling},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=HJGwcKclx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HJGwcKclx", "pdf_size": 0, "rating": "7;7;7", "confidence": "3;4;3", "rating_avg": 7.0, "confidence_avg": 3.3333333333333335, "replies_avg": 13, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 523, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4559451288287588807&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "HJIY0E9ge", "title": "A Simple yet Effective Method to Prune Dense Layers of Neural Networks", "track": "main", "status": "Reject", "tldr": "Pruning neural networks by adding output neurons with fully random targets and removing strongly correlated neurons.", "abstract": "Neural networks are usually over-parameterized with significant redundancy in the number of required neurons which results in unnecessary computation and memory usage at inference time. One common approach to address this issue is to prune these big networks by removing extra neurons and parameters while maintaining the accuracy. In this paper, we propose NoiseOut, a fully automated pruning algorithm based on the correlation between activations of neurons in the hidden layers. We prove that adding additional output neurons with entirely random targets results into a higher correlation between neurons which makes pruning by NoiseOut even more efficient. Finally, we test our method on various networks\nand datasets. These experiments exhibit high pruning rates while maintaining the accuracy of the original network.", "keywords": "Deep learning", "primary_area": "", "supplementary_material": "", "author": "Mohammad Babaeizadeh;Paris Smaragdis;Roy H. Campbell", "authorids": "mb2@illinois.edu.edu;paris@illinois.edu.edu;rhc@illinois.edu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nbabaeizadeh2017a,\ntitle={A Simple yet Effective Method to Prune Dense Layers of Neural Networks},\nauthor={Mohammad Babaeizadeh and Paris Smaragdis and Roy H. 
Campbell},\nyear={2017},\nurl={https://openreview.net/forum?id=HJIY0E9ge}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HJIY0E9ge", "pdf_size": 0, "rating": "3;5;5", "confidence": "4;4;3", "rating_avg": 4.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": -0.49999999999999983, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14382426003367108435&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "HJKkY35le", "title": "Mode Regularized Generative Adversarial Networks", "track": "main", "status": "Poster", "tldr": "", "abstract": "Although Generative Adversarial Networks achieve state-of-the-art results on a\nvariety of generative tasks, they are regarded as highly unstable and prone to miss\nmodes. We argue that these bad behaviors of GANs are due to the very particular\nfunctional shape of the trained discriminators in high dimensional spaces, which\ncan easily make training stuck or push probability mass in the wrong direction,\ntowards that of higher concentration than that of the data generating distribution.\nWe introduce several ways of regularizing the objective, which can dramatically\nstabilize the training of GAN models. We also show that our regularizers can help\nthe fair distribution of probability mass across the modes of the data generating\ndistribution during the early phases of training, thus providing a unified solution\nto the missing modes problem.", "keywords": "Deep learning;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Tong Che;Yanran Li;Athul Jacob;Yoshua Bengio;Wenjie Li", "authorids": "tong.che@umontreal.ca;csyli@comp.polyu.edu.hk;ap.jacob@umontreal.ca;yoshua.bengio@umontreal.ca;cswjli@comp.polyu.edu.hk", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nche2017mode,\ntitle={Mode Regularized Generative Adversarial Networks},\nauthor={Tong Che and Yanran Li and Athul Jacob and Yoshua Bengio and Wenjie Li},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=HJKkY35le}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HJKkY35le", "pdf_size": 0, "rating": "4;7;7", "confidence": "4;4;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 27, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 744, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8235362476181771248&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "HJOZBvcel", "title": "Learning to Discover Sparse Graphical Models", "track": "main", "status": "Workshop", "tldr": "Sparse graphical model structure estimators make restrictive assumptions. We show that empirical risk minimization can yield SOTA estimators for edge prediction across a wide range of graph structure distributions. ", "abstract": "We consider structure discovery of undirected graphical models from observational data. Inferring likely structures from few examples is a complex task often requiring the formulation of priors and sophisticated inference procedures. 
In the setting of Gaussian Graphical Models (GGMs) a popular estimator is a maximum likelihood objective with a penalization on the precision matrix. Adapting this estimator to capture domain-specific knowledge as priors or a new data likelihood requires great effort. In addition, structure recovery is an indirect consequence of the data-fit term. By contrast, it may be easier to generate training samples of data that arise from graphs with the desired structure properties. We propose here to leverage this latter source of information as training data to learn a function mapping from empirical covariance matrices to estimated graph structures. Learning this function brings two benefits: it implicitly models the desired structure or sparsity properties to form suitable priors, and it can be tailored to the specific problem of edge structure discovery, rather than maximizing data likelihood. We apply this framework to several real-world problems in structure discovery and show that it can be competitive to standard approaches such as graphical lasso, at a fraction of the execution speed. We use convolutional neural networks to parametrize our estimators due to the compositional structure of the problem. Experimentally, our learnable graph-discovery method trained on synthetic data generalizes well to different data: identifying relevant edges in real data, completely unknown at training time. We find that on genetics, brain imaging, and simulation data we obtain competitive (and generally superior) performance, compared with analytical methods. ", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Eugene Belilovsky;Kyle Kastner;Gael Varoquaux;Matthew B. Blaschko", "authorids": "eugene.belilovsky@inria.fr;kyle.kastner@umontreal.ca;gael.varoquaux@inria.fr;matthew.blaschko@esat.kuleuven.be", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nbelilovsky2017learning,\ntitle={Learning to Discover Sparse Graphical Models},\nauthor={Eugene Belilovsky and Kyle Kastner and Gael Varoquaux and Matthew B. Blaschko},\nyear={2017},\nurl={https://openreview.net/forum?id=HJOZBvcel}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HJOZBvcel", "pdf_size": 0, "rating": "5;6;7", "confidence": "2;3;3", "rating_avg": 6.0, "confidence_avg": 2.6666666666666665, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8693460984110187084&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 13 }, { "id": "HJPmdP9le", "title": "Efficient Summarization with Read-Again and Copy Mechanism", "track": "main", "status": "Reject", "tldr": "", "abstract": "Encoder-decoder models have been widely used to solve sequence to sequence prediction tasks. However current approaches suffer from two shortcomings. First, the encoders compute a representation of each word taking into account only the history of the words it has read so far, yielding suboptimal representations. Second, current models utilize large vocabularies in order to minimize the problem of unknown words, resulting in slow decoding times and large storage costs. In this paper we address both shortcomings. 
Towards this goal, we first introduce a simple mechanism that first reads the input sequence before committing to a representation of each word. Furthermore, we propose a simple copy mechanism that is able to exploit very small vocabularies and handle out-of-vocabulary words. We demonstrate the effectiveness of our approach on the Gigaword dataset and DUC competition outperforming the state-of-the-art.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wenyuan Zeng;Wenjie Luo;Sanja Fidler;Raquel Urtasun", "authorids": "cengwy13@mails.tsinghua.edu.cn;wenjie@cs.toronto.edu;fidler@cs.toronto.edu;urtasun@cs.toronto.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nzeng2017efficient,\ntitle={Efficient Summarization with Read-Again and Copy Mechanism},\nauthor={Wenyuan Zeng and Wenjie Luo and Sanja Fidler and Raquel Urtasun},\nyear={2017},\nurl={https://openreview.net/forum?id=HJPmdP9le}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HJPmdP9le", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;5;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 15, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 131, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16019903738050229893&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5 }, { "id": "HJSCGD9ex", "title": "Beyond Bilingual: Multi-sense Word Embeddings using Multilingual Context", "track": "main", "status": "Reject", "tldr": "Using multilingual context for learning multi-sense embeddings helps.", "abstract": "Word embeddings, which represent a word as a point in a vector space, have become ubiquitous to several NLP tasks. A recent line of work uses bilingual (two languages) corpora to learn a different vector for each sense of a word, by exploiting crosslingual signals to aid sense identification. We present a multi-view Bayesian non-parametric algorithm which improves multi-sense word embeddings by (a) using multilingual (i.e., more than two languages) corpora to significantly improve sense embeddings beyond what one achieves with bilingual information, and (b) uses a principled approach to learn a variable number of senses per word, in a data-driven manner. Ours is the first approach with the ability to leverage multilingual corpora efficiently for multi-sense representation learning. Experiments show that multilingual training significantly improves performance over monolingual and bilingual training, by allowing us to combine different parallel corpora to leverage multilingual context. 
Multilingual training yields comparable performance to a state of the art monolingual model trained on five times more training data.", "keywords": "Natural language processing", "primary_area": "", "supplementary_material": "", "author": "Shyam Upadhyay;Kai-Wei Chang;James Zou;Matt Taddy;Adam Kalai", "authorids": "upadhya3@illinois.edu;kwchang@virginia.edu;jamesz@stanford.edu;taddy@microsoft.com;adum@microsoft.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nupadhyay2017beyond,\ntitle={Beyond Bilingual: Multi-sense Word Embeddings using Multilingual Context},\nauthor={Shyam Upadhyay and Kai-Wei Chang and James Zou and Matt Taddy and Adam Kalai},\nyear={2017},\nurl={https://openreview.net/forum?id=HJSCGD9ex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HJSCGD9ex", "pdf_size": 0, "rating": "4;4;5", "confidence": "3;4;3", "rating_avg": 4.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 11, "authors#_avg": 5, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=692049706172822208&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "HJStZKqel", "title": "Lifelong Perceptual Programming By Example", "track": "main", "status": "Workshop", "tldr": "Combination of differentiable interpreters and neural networks for lifelong learning of a model composed of neural and source code functions", "abstract": "We introduce and develop solutions for the problem of Lifelong Perceptual Programming By Example (LPPBE). The problem is to induce a series of programs that require understanding perceptual data like images or text. LPPBE systems learn from weak supervision (input-output examples) and incrementally construct a shared library of components that grows and improves as more tasks are solved. Methodologically, we extend differentiable interpreters to operate on perceptual data and to share components across tasks. Empirically we show that this leads to a lifelong learning system that transfers knowledge to new tasks more effectively than baselines, and the performance on earlier tasks continues to improve even as the system learns on new, different tasks.", "keywords": "Deep learning;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Alexander L. 
Gaunt;Marc Brockschmidt;Nate Kushman;Daniel Tarlow", "authorids": "t-algaun@microsoft.com;mabrocks@microsoft.com;nkushman@microsoft.com;dtarlow@microsoft.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ngaunt2017lifelong,\ntitle={Lifelong Perceptual Programming By Example},\nauthor={Alexander L. Gaunt and Marc Brockschmidt and Nate Kushman and Daniel Tarlow},\nyear={2017},\nurl={https://openreview.net/forum?id=HJStZKqel}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer5", "site": "https://openreview.net/forum?id=HJStZKqel", "pdf_size": 0, "rating": "2;4;8", "confidence": "5;4;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 19, "authors#_avg": 4, "corr_rating_confidence": -0.7559289460184545, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13173370968164527482&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "HJTXaw9gx", "title": "Recursive Regression with Neural Networks: Approximating the HJI PDE Solution", "track": "main", "status": "Workshop", "tldr": "A neural network that learns an approximation to a function by generating its own regression points", "abstract": "Most machine learning applications using neural networks seek to approximate some function g(x) by minimizing some cost criterion. In the simplest case, if one has access to pairs of the form (x, y) where y = g(x), the problem can be framed as a regression problem. Beyond this family of problems, we find many cases where the unavailability of data pairs makes this approach unfeasible. However, similar to what we find in the reinforcement learning literature, if we have some known properties of the function we are seeking to approximate, there is still hope to frame the problem as a regression problem. In this context, we present an algorithm that approximates the solution to a partial differential equation known as the Hamilton-Jacobi-Isaacs PDE and compare it to current state of the art tools. 
This PDE, which is found in the fields of control theory and robotics, is of particular importance in safety critical systems where guarantees of performance are a must.", "keywords": "Supervised Learning;Games;Theory", "primary_area": "", "supplementary_material": "", "author": "Vicen\u00e7 Rubies Royo;Claire Tomlin", "authorids": "vrubies@berkeley.edu;tomlin@berkeley.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nroyo2017recursive,\ntitle={Recursive Regression with Neural Networks: Approximating the {HJI} {PDE} Solution},\nauthor={Vicen\u00e7 Rubies Royo and Claire Tomlin},\nyear={2017},\nurl={https://openreview.net/forum?id=HJTXaw9gx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HJTXaw9gx", "pdf_size": 0, "rating": "3;5;7", "confidence": "5;1;3", "rating_avg": 5.0, "confidence_avg": 3.0, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": -0.5, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16643988657173638160&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "HJTzHtqee", "title": "A Compare-Aggregate Model for Matching Text Sequences", "track": "main", "status": "Poster", "tldr": "A general \"compare-aggregate\" framework that performs word-level matching followed by aggregation using Convolutional Neural Networks", "abstract": "Many NLP tasks including machine comprehension, answer selection and text entailment require the comparison between sequences. Matching the important units between sequences is a key to solve these problems. In this paper, we present a general \"compare-aggregate\" framework that performs word-level matching followed by aggregation using Convolutional Neural Networks. We particularly focus on the different comparison functions we can use to match two vectors. We use four different datasets to evaluate the model. We find that some simple comparison functions based on element-wise operations can work better than standard neural network and neural tensor network. 
", "keywords": "Natural language processing;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Shuohang Wang;Jing Jiang", "authorids": "shwang.2014@phdis.smu.edu.sg;jingjiang@smu.edu.sg", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nwang2017a,\ntitle={A Compare-Aggregate Model for Matching Text Sequences},\nauthor={Shuohang Wang and Jing Jiang},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=HJTzHtqee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HJTzHtqee", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;5;5", "rating_avg": 7.0, "confidence_avg": 4.666666666666667, "replies_avg": 14, "authors#_avg": 2, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 332, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3424529727297448315&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "HJV1zP5xg", "title": "Diverse Beam Search: Decoding Diverse Solutions from Neural Sequence Models", "track": "main", "status": "Reject", "tldr": "We introduce a novel, diversity promoting beam search algorithm that results in significantly improved diversity between decoded sequences as evaluated on multiple sequence generation tasks.", "abstract": "Neural sequence models are widely used to model time-series data. Equally ubiquitous is the usage of beam search (BS) as an approximate inference algorithm to decode output sequences from these models. BS explores the search space in a greedy left-right fashion retaining only the top B candidates. This tends to result in sequences that differ only slightly from each other. Producing lists of nearly identical sequences is not only computationally wasteful but also typically fails to capture the inherent ambiguity of complex AI tasks. To overcome this problem, we propose Diverse Beam Search (DBS), an alternative to BS that decodes a list of diverse outputs by optimizing a diversity-augmented objective. We observe that our method not only improved diversity but also finds better top 1 solutions by controlling for the exploration and exploitation of the search space. Moreover, these gains are achieved with minimal computational or memory overhead com- pared to beam search. To demonstrate the broad applicability of our method, we present results on image captioning, machine translation, conversation and visual question generation using both standard quantitative metrics and qualitative human studies. We find that our method consistently outperforms BS and previously proposed techniques for diverse decoding from neural sequence models.", "keywords": "Deep learning;Computer vision;Natural language processing", "primary_area": "", "supplementary_material": "", "author": "Ashwin K Vijayakumar;Michael Cogswell;Ramprasaath R. 
Selvaraju;Qing Sun;Stefan Lee;David Crandall;Dhruv Batra", "authorids": "ashwinkv@vt.edu;cogswell@vt.edu;ram21@vt.edu;sunqing@vt.edu;steflee@vt.edu;djcran@indiana.edu;dbatra@vt.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nvijayakumar2017diverse,\ntitle={Diverse Beam Search: Decoding Diverse Solutions from Neural Sequence Models},\nauthor={Ashwin K Vijayakumar and Michael Cogswell and Ramprasaath R. Selvaraju and Qing Sun and Stefan Lee and David Crandall and Dhruv Batra},\nyear={2017},\nurl={https://openreview.net/forum?id=HJV1zP5xg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HJV1zP5xg", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 16, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 641, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13237207204383391681&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "HJWHIKqgl", "title": "Generative Models and Model Criticism via Optimized Maximum Mean Discrepancy", "track": "main", "status": "Poster", "tldr": "A way to optimize the power of an MMD test, to use it for evaluating generative models and training GANs", "abstract": "We propose a method to optimize the representation and distinguishability of samples from two probability distributions, by maximizing the estimated power of a statistical test based on the maximum mean discrepancy (MMD). This optimized MMD is applied to the setting of unsupervised learning by generative adversarial networks (GAN), in which a model attempts to generate realistic samples, and a discriminator attempts to tell these apart from data samples. In this context, the MMD may be used in two roles: first, as a discriminator, either directly on the samples, or on features of the samples. Second, the MMD can be used to evaluate the performance of a generative model, by testing the model\u2019s samples against a reference data set. In the latter role, the optimized MMD is particularly helpful, as it gives an interpretable indication of how the model and data distributions differ, even in cases where individual model samples are not easily distinguished either by eye or by classifier.", "keywords": "Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Danica J. Sutherland;Hsiao-Yu Tung;Heiko Strathmann;Soumyajit De;Aaditya Ramdas;Alex Smola;Arthur Gretton", "authorids": "dsuth@cs.ubc.ca;htung@cs.cmu.edu;heiko.strathmann@gmail.com;soumyajitde.cse@gmail.com;aramdas@berkeley.edu;alex@smola.org;arthur.gretton@gmail.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nsutherland2017generative,\ntitle={Generative Models and Model Criticism via Optimized Maximum Mean Discrepancy},\nauthor={Danica J. 
Sutherland and Hsiao-Yu Tung and Heiko Strathmann and Soumyajit De and Aaditya Ramdas and Alex Smola and Arthur Gretton},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=HJWHIKqgl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HJWHIKqgl", "pdf_size": 0, "rating": "6;7;8", "confidence": "3;4;3", "rating_avg": 7.0, "confidence_avg": 3.3333333333333335, "replies_avg": 12, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 252, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14039271793330969441&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "HJWzXsKxx", "title": "Training Long Short-Term Memory With Sparsified Stochastic Gradient Descent", "track": "main", "status": "Reject", "tldr": "A simple yet effective technique to induce considerable amount of sparsity in LSTM training", "abstract": "Prior work has demonstrated that exploiting the sparsity can dramatically improve the energy efficiency and shrink the memory footprint of Convolutional Neural Networks (CNNs).\nHowever, these sparsity-centric optimization techniques might be less effective for Long Short-Term Memory (LSTM) based Recurrent Neural Networks (RNNs), especially for the training phase, because of the significant structural difference between the neurons. To investigate if there is possible sparsity-centric optimization for training LSTM-based RNNs, we studied several applications and observed that there is potential sparsity in the gradients generated in the backward propagation. In this paper, we illustrate why the sparsity exists and propose a simple yet effective thresholding technique to induce further sparsity during LSTM-based RNN training. Experiment results show that the proposed technique can increase the sparsity of linear gate gradients to higher than 80\\% without loss of performance, which makes more than 50\\% multiply-accumulate (MAC) operations redundant in an entire LSTM training process. These redundant MAC operations can be eliminated by hardware techniques to improve the energy efficiency and training speed of LSTM-based RNNs.", "keywords": "Optimization;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Maohua Zhu;Minsoo Rhu;Jason Clemons;Stephen W. Keckler;Yuan Xie", "authorids": "maohuazhu@ece.ucsb.edu;mrhu@nvidia.com;jclemons@nvidia.com;skeckler@nvidia.com;yuanxie@ece.ucsb.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nzhu2017training,\ntitle={Training Long Short-Term Memory With Sparsified Stochastic Gradient Descent},\nauthor={Maohua Zhu and Minsoo Rhu and Jason Clemons and Stephen W. 
Keckler and Yuan Xie},\nyear={2017},\nurl={https://openreview.net/forum?id=HJWzXsKxx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HJWzXsKxx", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;3", "rating_avg": 4.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 8, "authors#_avg": 5, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9963311925855260890&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "HJcLcw9xg", "title": "The Preimage of Rectifier Network Activities", "track": "main", "status": "Reject", "tldr": "", "abstract": "The preimage of the activity at a certain level of a deep network is the set of inputs that result in the same node activity. For fully connected multi layer rectifier networks we demonstrate how to compute the preimages of activities at arbitrary levels from knowledge of the parameters in a deep rectifying network. If the preimage set of a certain activity in the network contains elements from more than one class it means that these classes are irreversibly mixed. This implies that preimage sets which are piecewise linear manifolds are building blocks for describing the input manifolds specific classes, i.e. all preimages should ideally be from the same class. We believe that the knowledge of how to compute preimages will be valuable in understanding the efficiency displayed by deep learning networks and could potentially be used in designing more efficient training algorithms.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Stefan Carlsson;Hossein Azizpour;Ali Razavian", "authorids": "stefanc@kth.se;azizpour@kth.se;razavian@kth.se", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ncarlsson2017the,\ntitle={The Preimage of Rectifier Network Activities},\nauthor={Stefan Carlsson and Hossein Azizpour and Ali Razavian},\nyear={2017},\nurl={https://openreview.net/forum?id=HJcLcw9xg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HJcLcw9xg", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;0;5", "rating_avg": 4.0, "confidence_avg": 3.0, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14441281031671004108&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "HJeqWztlg", "title": "Hierarchical compositional feature learning", "track": "main", "status": "Reject", "tldr": "We show that max-product message passing with an appropriate schedule can be used to perform inference and learning in a directed multilayer generative model, thus recovering interpretable features from binary images.", "abstract": "We introduce the hierarchical compositional network (HCN), a directed generative model able to discover and disentangle, without supervision, the building blocks of a set of binary images. The building blocks are binary features defined hierarchically as a composition of some of the features in the layer immediately below, arranged in a particular manner. At a high level, HCN is similar to a sigmoid belief network with pooling. 
Inference and learning in HCN are very challenging and existing variational approximations do not work satisfactorily. A main contribution of this work is to show that both can be addressed using max-product message passing (MPMP) with a particular schedule (no EM required). Also, using MPMP as an inference engine for HCN makes new tasks simple: adding supervision information, classifying images, or performing inpainting all correspond to clamping some variables of the model to their known values and running MPMP on the rest. When used for classification, fast inference with HCN has exactly the same functional form as a convolutional neural network (CNN) with linear activations and binary weights. However, HCN\u2019s features are qualitatively very different.", "keywords": "Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Miguel Lazaro-Gredilla;Yi Liu;D. Scott Phoenix;Dileep George", "authorids": "miguel@vicarious.com;yi@vicarious.com;scott@vicarious.com;dileep@vicarious.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nlazaro-gredilla2017hierarchical,\ntitle={Hierarchical compositional feature learning},\nauthor={Miguel Lazaro-Gredilla and Yi Liu and D. Scott Phoenix and Dileep George},\nyear={2017},\nurl={https://openreview.net/forum?id=HJeqWztlg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HJeqWztlg", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10738849848106052700&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "HJgXCV9xx", "title": "Dialogue Learning With Human-in-the-Loop", "track": "main", "status": "Poster", "tldr": "we explore a reinforcement learning setting for dialogue where the bot improves its abilities using reward-based or textual feedback", "abstract": "An important aspect of developing conversational agents is to give a bot the ability to improve through communicating with humans and to learn from the mistakes that it makes. Most research has focused on learning from fixed training sets of labeled data rather than interacting with a dialogue partner in an online fashion. In this paper we explore this direction in a reinforcement learning setting where the bot improves its question-answering ability from feedback a teacher gives following its generated responses. We build a simulator that tests various aspects of such learning in a synthetic environment, and introduce models that work in this regime. Finally, real experiments with Mechanical Turk validate the approach.\n", "keywords": "Natural language processing", "primary_area": "", "supplementary_material": "", "author": "Jiwei Li;Alexander H. Miller;Sumit Chopra;Marc'Aurelio Ranzato;Jason Weston", "authorids": "jiwel@fb.com;ahm@fb.com;spchopra@fb.com;ranzato@fb.com;jase@fb.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nli2017dialogue,\ntitle={Dialogue Learning With Human-in-the-Loop},\nauthor={Jiwei Li and Alexander H. 
Miller and Sumit Chopra and Marc'Aurelio Ranzato and Jason Weston},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=HJgXCV9xx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer5;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=HJgXCV9xx", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;3;4", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 18, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 173, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14606408244559982008&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "HJhcg6Fxg", "title": "Binary Paragraph Vectors", "track": "main", "status": "Reject", "tldr": "Learning short codes for text documents with Binary Paragraph Vectors.", "abstract": "Recently Le & Mikolov described two log-linear models, called Paragraph Vector, that can be used to learn state-of-the-art distributed representations of documents. Inspired by this work, we present Binary Paragraph Vector models: simple neural networks that learn short binary codes for fast information retrieval. We show that binary paragraph vectors outperform autoencoder-based binary codes, despite using fewer bits. We also evaluate their precision in transfer learning settings, where binary codes are inferred for documents unrelated to the training corpus. Results from these experiments indicate that binary paragraph vectors can capture semantics relevant for various domain-specific documents. Finally, we present a model that simultaneously learns short binary codes and longer, real-valued representations. This model can be used to rapidly retrieve a short list of highly relevant documents from a large document collection.", "keywords": "Natural language processing;Transfer Learning", "primary_area": "", "supplementary_material": "", "author": "Karol Grzegorczyk;Marcin Kurdziel", "authorids": "kgr@agh.edu.pl;kurdziel@agh.edu.pl", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ngrzegorczyk2017binary,\ntitle={Binary Paragraph Vectors},\nauthor={Karol Grzegorczyk and Marcin Kurdziel},\nyear={2017},\nurl={https://openreview.net/forum?id=HJhcg6Fxg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer5", "site": "https://openreview.net/forum?id=HJhcg6Fxg", "pdf_size": 0, "rating": "5;6;6", "confidence": "5;3;2", "rating_avg": 5.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 14, "authors#_avg": 2, "corr_rating_confidence": -0.9449111825230683, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17786054003318476301&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "HJjiFK5gx", "title": "Neural Program Lattices", "track": "main", "status": "Poster", "tldr": "", "abstract": "We propose the Neural Program Lattice (NPL), a neural network that learns to perform complex tasks by composing low-level programs to express high-level programs. Our starting point is the recent work on Neural Programmer-Interpreters (NPI), which can only learn from strong supervision that contains the whole hierarchy of low-level and high-level programs. NPLs remove this limitation by providing the ability to learn from weak supervision consisting only of sequences of low-level operations. 
We demonstrate the capability of NPL to learn to perform long-hand addition and arrange blocks in a grid-world environment. Experiments show that it performs on par with NPI while using weak supervision in place of most of the strong supervision, thus indicating its ability to infer the high-level program structure from examples containing only the low-level operations.", "keywords": "Deep learning;Semi-Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Chengtao Li;Daniel Tarlow;Alexander L. Gaunt;Marc Brockschmidt;Nate Kushman", "authorids": "ctli@mit.edu;dtarlow@microsoft.com;algaunt@microsoft.com;mabrocks@microsoft.com;nkushman@microsoft.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nli2017neural,\ntitle={Neural Program Lattices},\nauthor={Chengtao Li and Daniel Tarlow and Alexander L. Gaunt and Marc Brockschmidt and Nate Kushman},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=HJjiFK5gx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HJjiFK5gx", "pdf_size": 0, "rating": "4;7;7", "confidence": "4;4;5", "rating_avg": 6.0, "confidence_avg": 4.333333333333333, "replies_avg": 17, "authors#_avg": 5, "corr_rating_confidence": 0.5, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7235295132159473259&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "HJlgm-B9lx", "title": "Learning to Understand: Incorporating Local Contexts with Global Attention for Sentiment Classification", "track": "main", "status": "Reject", "tldr": "a global-local mutually representation-learning attention model for sentiment analysis", "abstract": "Recurrent neural networks have shown their ability to construct sentence or paragraph representations. Variants such as LSTM overcome the problem of vanishing gradients to some degree, thus being able to model long-time dependency. Still, these recurrent based models lack the ability of capturing complex semantic compositions. To address this problem, we propose a model which can incorporate local contexts with the guide of global context attention. Both the local and global contexts are obtained through LSTM networks. The working procedure of this model is just like how we human beings read a text and then answer a related question. Empirical studies show that the proposed model can achieve state of the art on some benchmark datasets. Attention visualization also verifies our intuition. 
Meanwhile, this model does not need pretrained embeddings to get good results.", "keywords": "Natural language processing;Deep learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Zhigang Yuan;Yuting Hu;Yongfeng Huang", "authorids": "yuanzg14@mails.tsinghua.edu.cn;hu-yt12@mails.tsinghua.edu.cn;yfhuang@tsinghua.edu.cn", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nyuan2017learning,\ntitle={Learning to Understand: Incorporating Local Contexts with Global Attention for Sentiment Classification},\nauthor={Zhigang Yuan and Yuting Hu and Yongfeng Huang},\nyear={2017},\nurl={https://openreview.net/forum?id=HJlgm-B9lx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HJlgm-B9lx", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;4;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15310927880409720206&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "HJpfMIFll", "title": "Geometry of Polysemy", "track": "main", "status": "Poster", "tldr": "", "abstract": "Vector representations of words have heralded a transformational approach to classical problems in NLP; the most popular example is word2vec. However, a single vector does not suffice to model the polysemous nature of many (frequent) words, i.e., words with multiple meanings. In this paper, we propose a three-fold approach for unsupervised polysemy modeling: (a) context representations, (b) sense induction and disambiguation and (c) lexeme (as a word and sense pair) representations. A key feature of our work is the finding that a sentence containing a target word is well represented by a low-rank subspace, instead of a point in a vector space. We then show that the subspaces associated with a particular sense of the target word tend to intersect over a line (one-dimensional subspace), which we use to disambiguate senses using a clustering algorithm that harnesses the Grassmannian geometry of the representations. The disambiguation algorithm, which we call $K$-Grassmeans, leads to a procedure to label the different senses of the target word in the corpus -- yielding lexeme vector representations, all in an unsupervised manner starting from a large (Wikipedia) corpus in English. 
Apart from several prototypical target (word,sense) examples and a host of empirical studies to intuit and justify the various geometric representations, we validate our algorithms on standard sense induction and disambiguation datasets and present new state-of-the-art results.", "keywords": "Natural language processing", "primary_area": "", "supplementary_material": "", "author": "Jiaqi Mu;Suma Bhat;Pramod Viswanath", "authorids": "jiaqimu2@illinois.edu;spbhat2@illinois.edu;pramodv@illinois.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nmu2017geometry,\ntitle={Geometry of Polysemy},\nauthor={Jiaqi Mu and Suma Bhat and Pramod Viswanath},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=HJpfMIFll}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HJpfMIFll", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;3;4", "rating_avg": 7.0, "confidence_avg": 3.6666666666666665, "replies_avg": 18, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7771521382187957938&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "HJrDIpiee", "title": "Investigating Recurrence and Eligibility Traces in Deep Q-Networks", "track": "main", "status": "Reject", "tldr": "Analyze the effects of using eligibility traces different optimizations in Deep Recurrent Q-Networks", "abstract": "Eligibility traces in reinforcement learning are used as a bias-variance trade-off and can often speed up training time by propagating knowledge back over time-steps in a single update. We investigate the use of eligibility traces in combination with recurrent networks in the Atari domain. 
We illustrate the benefits of both recurrent nets and eligibility traces in some Atari games, and highlight also the importance of the optimization used in the training.", "keywords": "Reinforcement Learning;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Jean Harb;Doina Precup", "authorids": "jharb@cs.mcgill.ca;dprecup@cs.mcgill.ca", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nharb2017investigating,\ntitle={Investigating Recurrence and Eligibility Traces in Deep Q-Networks},\nauthor={Jean Harb and Doina Precup},\nyear={2017},\nurl={https://openreview.net/forum?id=HJrDIpiee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer5;AnonReviewer4", "site": "https://openreview.net/forum?id=HJrDIpiee", "pdf_size": 0, "rating": "3;4;4", "confidence": "5;5;4", "rating_avg": 3.6666666666666665, "confidence_avg": 4.666666666666667, "replies_avg": 10, "authors#_avg": 2, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 196, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14164403480633549438&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "HJtN5K9gx", "title": "Learning Disentangled Representations in Deep Generative Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep generative models provide a powerful and flexible means to learn complex distributions over data by incorporating neural networks into latent-variable models. Variational approaches to training such models introduce a probabilistic encoder that casts data, typically unsupervised, into an entangled and unstructured representation space. While unsupervised learning is often desirable, sometimes even necessary, when we lack prior knowledge about what to represent, being able to incorporate domain knowledge in characterising certain aspects of variation in the data can often help learn better disentangled representations. Here, we introduce a new formulation of semi-supervised learning in variational autoencoders that allows precisely this. It permits flexible specification of probabilistic encoders as directed graphical models via a stochastic computation graph, containing both continuous and discrete latent variables, with conditional distributions parametrised by neural networks. We demonstrate how the provision of structure, along with a few labelled examples indicating plausible values for some components of the latent space, can help quickly learn disentangled representations. We then evaluate its ability to do so, both qualitatively by exploring its generative capacity, and quantitatively by using the disentangled representation to perform classification, on a variety of models and datasets.", "keywords": "Semi-Supervised Learning;Deep learning;Computer vision", "primary_area": "", "supplementary_material": "", "author": "N. Siddharth;Brooks Paige;Alban Desmaison;Jan-Willem van de Meent;Frank Wood;Noah D. Goodman;Pushmeet Kohli;Philip H.S. 
Torr", "authorids": "nsid@robots.ox.ac.uk;brooks@robots.ox.ac.uk;alban@robots.ox.ac.uk;j.vandemeent@northeastern.edu;fwood@robots.ox.ac.uk;ngoodman@stanford.edu;pkohli@microsoft.com;philip.torr@eng.ox.ac.uk", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@misc{\nsiddharth2017learning,\ntitle={Learning Disentangled Representations in Deep Generative Models},\nauthor={N. Siddharth and Brooks Paige and Alban Desmaison and Jan-Willem van de Meent and Frank Wood and Noah D. Goodman and Pushmeet Kohli and Philip H.S. Torr},\nyear={2017},\nurl={https://openreview.net/forum?id=HJtN5K9gx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HJtN5K9gx", "pdf_size": 0, "rating": "5;6;6", "confidence": "3;4;5", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 8, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6527108189360934534&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "HJy_5Mcll", "title": "ENet: A Deep Neural Network Architecture for Real-Time Semantic Segmentation", "track": "main", "status": "Reject", "tldr": "", "abstract": "The ability to perform pixel-wise semantic segmentation in real-time is of paramount importance in practical mobile applications. Recent deep neural networks aimed at this task have the disadvantage of requiring a large number of floating point operations and have long run-times that hinder their usability. In this paper, we propose a novel deep neural network architecture named ENet (efficient neural network), created specifically for tasks requiring low latency operation. ENet is up to 18x faster, requires 75x less FLOPs, has 79x less parameters, and provides similar or better accuracy to existing models. \nWe have tested it on CamVid, Cityscapes and SUN datasets and report on comparisons with existing state-of-the-art methods, and the trade-offs between accuracy and processing time of a network. We present performance measurements of the proposed architecture on embedded systems and suggest possible software improvements that could make ENet even faster. 
\n", "keywords": "Deep learning", "primary_area": "", "supplementary_material": "", "author": "Adam Paszke;Abhishek Chaurasia;Sangpil Kim;Eugenio Culurciello", "authorids": "a.paszke@students.mimuw.edu.pl;aabhish@purdue.edu;sangpilkim@purdue.edu;euge@purdue.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\npaszke2017enet,\ntitle={{EN}et: A Deep Neural Network Architecture for Real-Time Semantic Segmentation},\nauthor={Adam Paszke and Abhishek Chaurasia and Sangpil Kim and Eugenio Culurciello},\nyear={2017},\nurl={https://openreview.net/forum?id=HJy_5Mcll}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HJy_5Mcll", "pdf_size": 0, "rating": "3;4;4;5", "confidence": "4;4;4;4", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 16, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 3112, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10064611961321647849&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "Hk-mgcsgx", "title": "An Information Retrieval Approach for Finding Dependent Subspaces of Multiple Views", "track": "main", "status": "Reject", "tldr": "A novel method for seeking dependent subspaces across multiple views, preserving neighborhood relationships of data", "abstract": "Finding relationships between multiple views of data is essential both in exploratory analysis and as pre-processing for predictive tasks. A prominent approach is to apply variants of Canonical Correlation Analysis (CCA), a classical method seeking correlated components between views. The basic CCA is restricted to maximizing a simple dependency criterion, correlation, measured directly between data coordinates. We introduce a new method that finds dependent subspaces of views directly optimized for the data analysis task of neighbor retrieval between multiple views. We optimize mappings for each view such as linear transformations to maximize cross-view similarity between neighborhoods of data samples. The criterion arises directly from the well-defined retrieval task, detects nonlinear and local similarities, measures dependency of data relationships rather than only individual data coordinates, and is related to well understood measures of information retrieval quality. 
In experiments the proposed method outperforms alternatives in preserving cross-view neighborhood similarities, and yields insights into local dependencies between multiple views.", "keywords": "Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Ziyuan Lin;Jaakko Peltonen", "authorids": "ziyuan.lin@aalto.fi;jaakko.peltonen@uta.fi", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nlin2017an,\ntitle={An Information Retrieval Approach for Finding Dependent Subspaces of Multiple Views},\nauthor={Ziyuan Lin and Jaakko Peltonen},\nyear={2017},\nurl={https://openreview.net/forum?id=Hk-mgcsgx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=Hk-mgcsgx", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;4;4", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10801504769570596000&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11 }, { "id": "Hk1iOLcle", "title": "MS MARCO: A Human-Generated MAchine Reading COmprehension Dataset", "track": "main", "status": "Reject", "tldr": "A large scale human annotated data set for web-based reading comprehension along with baselines.", "abstract": "This paper presents our recent work on the design and development of a new, large scale dataset, which we name MS MARCO, for MAchine Reading COmprehension. This new dataset is aimed to overcome a number of well-known weaknesses of previous publicly available datasets for the same task of reading comprehension and question answering. In MS MARCO, all questions are sampled from real anonymized user queries. The context passages, from which answers in the dataset are derived, are extracted from real web documents using the most advanced version of the Bing search engine. The answers to the queries are human generated. Finally, a subset of these queries has multiple answers. We aim to release one million queries and the corresponding answers in the dataset, which, to the best of our knowledge, is the most comprehensive real-world dataset of its kind in both quantity and quality. 
We are currently releasing 100,000 queries with their corresponding answers to inspire work in reading comprehension and question answering along with gathering feedback from the research community.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tri Nguyen;Mir Rosenberg;Xia Song;Jianfeng Gao;Saurabh Tiwary;Rangan Majumder;Li Deng", "authorids": "trnguye@microsoft.com;miriamr@microsoft.com;xiaso@microsoft.com;jfgao@microsoft.com;satiwary@microsoft.com;ranganm@microsoft.com;deng@microsoft.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nnguyen2017ms,\ntitle={{MS} {MARCO}: A Human-Generated {MA}chine Reading {CO}mprehension Dataset},\nauthor={Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng},\nyear={2017},\nurl={https://openreview.net/forum?id=Hk1iOLcle}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=Hk1iOLcle", "pdf_size": 0, "rating": "6;6;6", "confidence": "3;0;3", "rating_avg": 6.0, "confidence_avg": 2.0, "replies_avg": 12, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 1837, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5345294070960002557&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "Hk1l9Xqxe", "title": "BIOACOUSTIC SEGMENTATION BY HIERARCHICAL DIRICHLET PROCESS HIDDEN MARKOV MODEL", "track": "main", "status": "Reject", "tldr": "", "abstract": "Understanding the communication between different animals by analysing their acoustic signals is an important topic in bioacoustics. It can be a powerful tool for the preservation of ecological diversity. We investigate probabilistic models to analyse signals issued from real-world bioacoustic sound scenes. We study Bayesian non-parametric sequential models based on Hierarchical Dirichlet Process Hidden Markov Models (HDP-HMM). The model is able to infer hidden states, which are referred to here as song units. However, using such a model raises one main issue: defining the number of hidden states the model has to learn. In bioacoustic problems we often do not know the number of song units (unlike in human speech recognition). Hence, we work with the Hierarchical Dirichlet Process (HDP)-HMM, which is a Bayesian non-parametric (BNP) model that offers a way to tackle this challenging problem. We focus our work on unsupervised learning from bioacoustic data. It consists of simultaneously finding the structure of hidden song units and automatically inferring the unknown number of hidden states to represent the data. Two real bioacoustic sound scene applications are investigated in this work: whale and multi-species bird segmentation. The learning of these models proceeds by using Markov-Chain Monte Carlo (MCMC) sampling techniques on Mel Frequency Cepstral Coefficients (MFCC) of audio signals. The results show an interesting song unit segmentation of the bioacoustic signals and open new insights for unsupervised analysis of such signals. This paper illustrates the potential of chunking non-human animal signals into structured parts. This can yield a new species representation and help experts to better understand the behaviour of such species as Kershenbaum et al. 
(2014) wanted.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Vincent Roger;Marius Bartcus;Faicel Chamroukhi;Herv\u00e9 Glotin", "authorids": "vincent-roger@etud.univ-tln.fr;marius.bartcus@gmail.com;faicel.chamroukhi@unicaen.fr;glotin@univ-tln.fr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nroger2017bioacoustic,\ntitle={{BIOACOUSTIC} {SEGMENTATION} {BY} {HIERARCHICAL} {DIRICHLET} {PROCESS} {HIDDEN} {MARKOV} {MODEL}},\nauthor={Vincent Roger and Marius Bartcus and Faicel Chamroukhi and Herv{\\'e} Glotin},\nyear={2017},\nurl={https://openreview.net/forum?id=Hk1l9Xqxe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Hk1l9Xqxe", "pdf_size": 0, "rating": "4;5;5", "confidence": "3;3;5", "rating_avg": 4.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15486849122582762558&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "Hk3mPK5gg", "title": "Training Agent for First-Person Shooter Game with Actor-Critic Curriculum Learning", "track": "main", "status": "Poster", "tldr": "We propose a novel framework for training vision-based agent for First-Person Shooter (FPS) Game, Doom, using actor-critic model and curriculum training. ", "abstract": "In this paper, we propose a novel framework for training vision-based agent for First-Person Shooter (FPS) Game, in particular Doom.\nOur framework combines the state-of-the-art reinforcement learning approach (Asynchronous Advantage Actor-Critic (A3C) model) with curriculum learning. Our model is simple in design and only uses game states from the AI side, rather than using opponents' information. 
On a known map, our agent won 10 out of the 11 attended games and was the champion of Track1 in ViZDoom AI Competition 2016 by a large margin, 35\% higher score than the second place.", "keywords": "Reinforcement Learning;Applications;Games", "primary_area": "", "supplementary_material": "", "author": "Yuxin Wu;Yuandong Tian", "authorids": "ppwwyyxx@gmail.com;yuandong@fb.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nwu2017training,\ntitle={Training Agent for First-Person Shooter Game with Actor-Critic Curriculum Learning},\nauthor={Yuxin Wu and Yuandong Tian},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=Hk3mPK5gg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=Hk3mPK5gg", "pdf_size": 0, "rating": "4;6;7", "confidence": "5;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 12, "authors#_avg": 2, "corr_rating_confidence": -0.944911182523068, "gs_citation": 283, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7660660987455314612&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "Hk4_qw5xe", "title": "Towards Principled Methods for Training Generative Adversarial Networks", "track": "main", "status": "Oral", "tldr": "We introduce a theory about generative adversarial networks and their issues.", "abstract": "The goal of this paper is not to introduce a single algorithm or method, but to make theoretical steps towards fully understanding the training dynamics of generative adversarial networks. In order to substantiate our theoretical analysis, we perform targeted experiments to verify our assumptions, illustrate our claims, and quantify the phenomena. This paper is divided into three sections. The first section introduces the problem at hand. The second section is dedicated to studying and proving rigorously the problems including instability and saturation that arise when training generative adversarial networks. 
The third section examines a practical and theoretically grounded direction towards solving these problems, while introducing new tools to study them.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Martin Arjovsky;Leon Bottou", "authorids": "martinarjovsky@gmail.com;leonb@fb.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\narjovsky2017towards,\ntitle={Towards Principled Methods for Training Generative Adversarial Networks},\nauthor={Martin Arjovsky and Leon Bottou},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=Hk4_qw5xe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Hk4_qw5xe", "pdf_size": 0, "rating": "7;8;10", "confidence": "4;3;5", "rating_avg": 8.333333333333334, "confidence_avg": 4.0, "replies_avg": 28, "authors#_avg": 2, "corr_rating_confidence": 0.6546536707079771, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:R_emFfqRlHMJ:scholar.google.com/&scioq=Towards+Principled+Methods+for+Training+Generative+Adversarial+Networks&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "Hk4kQHceg", "title": "Multiplicative LSTM for sequence modelling", "track": "main", "status": "Workshop", "tldr": "Combines LSTM and multiplicative RNN architectures; achieves 1.19 bits/character on Hutter prize dataset with dynamic evaluation.", "abstract": "We introduce multiplicative LSTM (mLSTM), a novel recurrent neural network architecture for sequence modelling that combines the long short-term memory (LSTM) and multiplicative recurrent neural network architectures. mLSTM is characterised by its ability to have different recurrent transition functions for each possible input, which we argue makes it more expressive for autoregressive density estimation. We demonstrate empirically that mLSTM outperforms standard LSTM and its deep variants for a range of character level modelling tasks, and that this improvement increases with the complexity of the task. 
This model achieves a test error of 1.19 bits/character on the last 4 million characters of the Hutter prize dataset when combined with dynamic evaluation.", "keywords": "Deep learning;Natural language processing;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Ben Krause;Iain Murray;Steve Renals;Liang Lu", "authorids": "ben.krause@ed.ac.uk;i.murray@ed.ac.uk;s.renals@ed.ac.uk;llu@ttic.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nlee2017making,\ntitle={Making Stochastic Neural Networks from Deterministic Ones},\nauthor={Kimin Lee and Jaehyung Kim and Song Chong and Jinwoo Shin},\nyear={2017},\nurl={https://openreview.net/forum?id=B1akgy9xx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Hk4kQHceg", "pdf_size": 0, "rating": "4;4;6", "confidence": "5;4;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 15, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 275, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15032973139552148568&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "Hk6a8N5xe", "title": "Classify or Select: Neural Architectures for Extractive Document Summarization", "track": "main", "status": "Reject", "tldr": "This paper presents two different neural architectures for extractive document summarization whose predictions are very interpretable, and show that they reach or outperform state-of-the-art supervised models.", "abstract": "We present two novel and contrasting Recurrent Neural Network (RNN) based architectures for extractive summarization of documents. The Classifier based architecture sequentially accepts or rejects each sentence in the original document order for its membership in the summary. The Selector architecture, on the other hand, is free to pick one sentence at a time in any arbitrary order to generate the extractive summary. \n\nOur models under both architectures jointly capture the notions of salience and redundancy of sentences. In addition, these models have the advantage of being very interpretable, since they allow visualization of their predictions broken up by abstract features such as information content, salience and redundancy. \n\nWe show that our models reach or outperform state-of-the-art supervised models on two different corpora. 
We also recommend the conditions under which one architecture is superior to the other based on experimental evidence.", "keywords": "Natural language processing;Supervised Learning;Applications;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Ramesh Nallapati;Bowen Zhou and Mingbo Ma", "authorids": ";", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nnallapati2017classify,\ntitle={Classify or Select: Neural Architectures for Extractive Document Summarization},\nauthor={Ramesh Nallapati and Bowen Zhou and Mingbo Ma},\nyear={2017},\nurl={https://openreview.net/forum?id=Hk6a8N5xe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Hk6a8N5xe", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;4;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 17, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 115, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16655514127261179303&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "Hk85q85ee", "title": "Symmetry-Breaking Convergence Analysis of Certain Two-layered Neural Networks with ReLU nonlinearity", "track": "main", "status": "Workshop", "tldr": "In this paper, we use dynamical system to analyze the nonlinear weight dynamics of two-layered bias-free ReLU networks.", "abstract": "In this paper, we use dynamical system to analyze the nonlinear weight dynamics of two-layered bias-free networks in the form of $g(x; w) = \\sum_{j=1}^K \\sigma(w_j \\cdot x)$, where $\\sigma(\\cdot)$ is ReLU nonlinearity. We assume that the input $x$ follow Gaussian distribution. The network is trained using gradient descent to mimic the output of a teacher network of the same size with fixed parameters $w*$ using $l_2$ loss. We first show that when $K = 1$, the nonlinear dynamics can be written in close form, and converges to $w*$ with at least $(1-\\epsilon)/2$ probability, if random weight initializations of proper standard derivation ($\\sim 1/\\sqrt{d}$) is used, verifying empirical practice. For networks with many ReLU nodes ($K \\ge 2$), we apply our close form dynamics and prove that when the teacher parameters $\\{w*_j\\}_{j=1}^K$ forms orthonormal bases, (1) a symmetric weight initialization yields a convergence to a saddle point and (2) a certain symmetry-breaking weight initialization yields global convergence to $w*$ without local minima. To our knowledge, this is the first proof that shows global convergence in nonlinear neural network without unrealistic assumptions on the independence of ReLU activations. In addition, we also give a concise gradient update formulation for a multilayer ReLU network when it follows a teacher of the same size with $l_2$ loss. 
Simulations verify our theoretical analysis.", "keywords": "Theory;Deep learning;Optimization", "primary_area": "", "supplementary_material": "", "author": "Yuandong Tian", "authorids": "yuandong@fb.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nlee2017making,\ntitle={Making Stochastic Neural Networks from Deterministic Ones},\nauthor={Kimin Lee and Jaehyung Kim and Song Chong and Jinwoo Shin},\nyear={2017},\nurl={https://openreview.net/forum?id=B1akgy9xx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Hk85q85ee", "pdf_size": 0, "rating": "4;4;8", "confidence": "4;3;4", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 1, "corr_rating_confidence": 0.5, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3118385903390755218&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "Hk8N3Sclg", "title": "Multi-Agent Cooperation and the Emergence of (Natural) Language", "track": "main", "status": "Oral", "tldr": "", "abstract": "The current mainstream approach to train natural language systems is to expose them to large amounts of text. This passive learning is problematic if we are interested in developing interactive machines, such as conversational agents. We propose a framework for language learning that relies on multi-agent communication. We study this learning in the context of referential games. In these games, a sender and a receiver see a pair of images. The sender is told one of them is the target and is allowed to send a message to the receiver, while the receiver must rely on it to identify the target. Thus, the agents develop their own language interactively out of the need to communicate. We show that two networks with simple configurations are able to learn to coordinate in the referential game. 
We further explore whether the \u201cword meanings\u201d induced in the game reflect intuitive semantic properties of the objects depicted in the image, and we present a simple strategy for grounding the agents\u2019 code into natural language, a necessary step in developing machines that should eventually be able to communicate with humans.\n", "keywords": "Natural language processing;Reinforcement Learning;Games", "primary_area": "", "supplementary_material": "", "author": "Angeliki Lazaridou;Alexander Peysakhovich;Marco Baroni", "authorids": "angeliki.lazaridou@unitn.it;alexpeys@fb.com;marco.baroni@unitn.it", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nlazaridou2017multiagent,\ntitle={Multi-Agent Cooperation and the Emergence of (Natural) Language},\nauthor={Angeliki Lazaridou and Alexander Peysakhovich and Marco Baroni},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=Hk8N3Sclg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=Hk8N3Sclg", "pdf_size": 0, "rating": "7;7;7", "confidence": "3;3;3", "rating_avg": 7.0, "confidence_avg": 3.0, "replies_avg": 14, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 566, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1931070702879918446&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "Hk8TGSKlg", "title": "Reasoning with Memory Augmented Neural Networks for Language Comprehension", "track": "main", "status": "Poster", "tldr": "", "abstract": "Hypothesis testing is an important cognitive process that supports human reasoning. In this paper, we introduce a computational hypothesis testing approach based on memory augmented neural networks. Our approach involves a hypothesis testing loop that reconsiders and progressively refines a previously formed hypothesis in order to generate new hypotheses to test. We apply the proposed approach to language comprehension task by using Neural Semantic Encoders (NSE). 
Our NSE models achieve the state-of-the-art results showing an absolute improvement of 1.2% to 2.6% accuracy over previous results obtained by single and ensemble systems on standard machine comprehension benchmarks such as the Children's Book Test (CBT) and Who-Did-What (WDW) news article datasets.", "keywords": "Natural language processing;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Tsendsuren Munkhdalai;Hong Yu", "authorids": "tsendsuren.munkhdalai@umassmed.edu;hong.yu@umassmed.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nmunkhdalai2017reasoning,\ntitle={Reasoning with Memory Augmented Neural Networks for Language Comprehension},\nauthor={Tsendsuren Munkhdalai and Hong Yu},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=Hk8TGSKlg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Hk8TGSKlg", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;4;2", "rating_avg": 6.666666666666667, "confidence_avg": 3.0, "replies_avg": 12, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5986114925221993904&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "Hk8rlUqge", "title": "Joint Multimodal Learning with Deep Generative Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "We investigate deep generative models that can exchange multiple modalities bi-directionally, e.g., generating images from corresponding texts and vice versa. Recently, some studies handle multiple modalities on deep generative models, such as variational autoencoders (VAEs). However, these models typically assume that modalities are forced to have a conditioned relation, i.e., we can only generate modalities in one direction. To achieve our objective, we should extract a joint representation that captures high-level concepts among all modalities and through which we can exchange them bi-directionally. As described herein, we propose a joint multimodal variational autoencoder (JMVAE), in which all modalities are independently conditioned on joint representation. In other words, it models a joint distribution of modalities. Furthermore, to be able to generate missing modalities from the remaining modalities properly, we develop an additional method, JMVAE-kl, that is trained by reducing the divergence between JMVAE's encoder and prepared networks of respective modalities. Our experiments show that our proposed method can obtain appropriate joint representation from multiple modalities and that it can generate and reconstruct them more properly than conventional VAEs. 
We further demonstrate that JMVAE can generate multiple modalities bi-directionally.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Masahiro Suzuki;Kotaro Nakayama;Yutaka Matsuo", "authorids": "masa@weblab.t.u-tokyo.ac.jp;k-nakayama@weblab.t.u-tokyo.ac.jp;matsuo@weblab.t.u-tokyo.ac.jp", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nsuzuki2017joint,\ntitle={Joint Multimodal Learning with Deep Generative Models},\nauthor={Masahiro Suzuki and Kotaro Nakayama and Yutaka Matsuo},\nyear={2017},\nurl={https://openreview.net/forum?id=Hk8rlUqge}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=Hk8rlUqge", "pdf_size": 0, "rating": "3;5;5", "confidence": "4;5;3", "rating_avg": 4.333333333333333, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 290, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12383618545710695238&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "Hk95PK9le", "title": "Deep Biaffine Attention for Neural Dependency Parsing", "track": "main", "status": "Poster", "tldr": "", "abstract": "This paper builds off recent work from Kiperwasser & Goldberg (2016) using neural attention in a simple graph-based dependency parser. We use a larger but more thoroughly regularized parser than other recent BiLSTM-based approaches, with\nbiaffine classifiers to predict arcs and labels. Our parser gets state of the art or near state of the art performance on standard treebanks for six different languages, achieving 95.7% UAS and 94.1% LAS on the most popular English PTB dataset. This makes it the highest-performing graph-based parser on this benchmark\u2014outperforming Kiperwasser & Goldberg (2016) by 1.8% and 2.2%\u2014and comparable to the highest performing transition-based parser (Kuncoro et al., 2016), which achieves 95.8% UAS and 94.6% LAS. We also show which hyperparameter choices had a significant effect on parsing accuracy, allowing us to achieve large gains over other graph-based approaches.\n", "keywords": "Natural language processing;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Timothy Dozat;Christopher D. Manning", "authorids": "tdozat@stanford.edu;manning@stanford.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\ndozat2017deep,\ntitle={Deep Biaffine Attention for Neural Dependency Parsing},\nauthor={Timothy Dozat and Christopher D. 
Manning},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=Hk95PK9le}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=Hk95PK9le", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 21, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 1520, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2220752205833525649&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "HkCjNI5ex", "title": "Regularizing Neural Networks by Penalizing Confident Output Distributions", "track": "main", "status": "Reject", "tldr": "We show that penalizing low entropy output distributions, which has been shown to improve exploration in reinforcement learning, acts as a strong regularizer in supervised learning.", "abstract": "We propose regularizing neural networks by penalizing low entropy output distributions. We show that penalizing low entropy output distributions, which has been shown to improve exploration in reinforcement learning, acts as a strong regularizer in supervised learning. We connect our confidence penalty to label smoothing through the direction of the KL divergence between networks output distribution and the uniform distribution. We exhaustively evaluate our proposed confidence penalty and label smoothing (uniform and unigram) on 6 common benchmarks: image classification (MNIST and Cifar-10), language modeling (Penn Treebank), machine translation (WMT'14 English-to-German), and speech recognition (TIMIT and WSJ). We find that both label smoothing and our confidence penalty improve state-of-the-art models across benchmarks without modifying existing hyper-parameters.\n", "keywords": "Deep learning;Supervised Learning;Speech;Structured prediction", "primary_area": "", "supplementary_material": "", "author": "Gabriel Pereyra;George Tucker;Jan Chorowski;Lukasz Kaiser;Geoffrey Hinton", "authorids": "pereyra@google.com;gjt@google.com;chorowski@google.com;lukaszkaiser@google.com;geoffhinton@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\npereyra2017regularizing,\ntitle={Regularizing Neural Networks by Penalizing Confident Output Distributions},\nauthor={Gabriel Pereyra and George Tucker and Jan Chorowski and Lukasz Kaiser and Geoffrey Hinton},\nyear={2017},\nurl={https://openreview.net/forum?id=HkCjNI5ex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer5;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=HkCjNI5ex", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 17, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 1375, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17169779076640319067&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12 }, { "id": "HkE0Nvqlg", "title": "Structured Attention Networks", "track": "main", "status": "Poster", "tldr": "Use a graphical model as a hidden layer to perform attention over latent structures", "abstract": "Attention networks have proven to be an effective approach for embedding categorical inference within a deep neural network. 
However, for many tasks we may want to model richer structural dependencies without abandoning end-to-end training. In this work, we experiment with incorporating richer structural distributions, encoded using graphical models, within deep networks. We show that these structured attention networks are simple extensions of the basic attention procedure, and that they allow for extending attention beyond the standard soft-selection approach, such as attending to partial segmentations or to subtrees. We experiment with two different classes of structured attention networks: a linear-chain conditional random field and a graph-based parsing model, and describe how these models can be practically implemented as neural network layers. Experiments show that this approach is effective for incorporating structural biases, and structured attention networks outperform baseline attention models on a variety of synthetic and real tasks: tree transduction, neural machine translation, question answering, and natural language inference. We further find that models trained in this way learn interesting unsupervised hidden representations that generalize simple attention.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yoon Kim;Carl Denton;Luong Hoang;Alexander M. Rush", "authorids": "yoonkim@seas.harvard.edu;carldenton@college.harvard.edu;lhoang@g.harvard.edu;srush@seas.harvard.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nkim2017structured,\ntitle={Structured Attention Networks},\nauthor={Yoon Kim and Carl Denton and Luong Hoang and Alexander M. Rush},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=HkE0Nvqlg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HkE0Nvqlg", "pdf_size": 0, "rating": "8;8;8", "confidence": "5;3;4", "rating_avg": 8.0, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 674, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2293456029194846155&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9 }, { "id": "HkEI22jeg", "title": "Multilayer Recurrent Network Models of Primate Retinal Ganglion Cell Responses", "track": "main", "status": "Poster", "tldr": "", "abstract": "Developing accurate predictive models of sensory neurons is vital to understanding sensory processing and brain computations. The current standard approach to modeling neurons is to start with simple models and to incrementally add interpretable features. An alternative approach is to start with a more complex model that captures responses accurately, and then probe the fitted model structure to understand the neural computations. Here, we show that a multitask recurrent neural network (RNN) framework provides the flexibility necessary to model complex computations of neurons that cannot be captured by previous methods. Specifically, multilayer recurrent neural networks that share features across neurons outperform generalized linear models (GLMs) in predicting the spiking responses of parasol ganglion cells in the primate retina to natural images. The networks achieve good predictive performance given surprisingly small amounts of experimental training data. 
Additionally, we present a novel GLM-RNN hybrid model with separate spatial and temporal processing components which provides insights into the aspects of retinal processing better captured by the recurrent neural networks.", "keywords": "Deep learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Eleanor Batty;Josh Merel;Nora Brackbill;Alexander Heitman;Alexander Sher;Alan Litke;E.J. Chichilnisky;Liam Paninski", "authorids": "erb2180@columbia.edu;jsmerel@gmail.com;nbrack@stanford.edu;alexkenheitmen@gmail.com;sashake3@uscs.edu;Alan.Litke@cern.ch;ej@stanford.edu;liam@stat.columbia.edu", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@inproceedings{\nbatty2017multilayer,\ntitle={Multilayer Recurrent Network Models of Primate Retinal Ganglion Cell Responses},\nauthor={Eleanor Batty and Josh Merel and Nora Brackbill and Alexander Heitman and Alexander Sher and Alan Litke and E.J. Chichilnisky and Liam Paninski},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=HkEI22jeg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer1", "site": "https://openreview.net/forum?id=HkEI22jeg", "pdf_size": 0, "rating": "4;7;8", "confidence": "4;4;5", "rating_avg": 6.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 14, "authors#_avg": 8, "corr_rating_confidence": 0.6933752452815364, "gs_citation": 93, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9779378576054715402&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "HkIQH7qel", "title": "Learning Recurrent Span Representations for Extractive Question Answering", "track": "main", "status": "Reject", "tldr": "We present a globally normalized architecture for extractive question answering that contains explicit representations of all possible answer spans.", "abstract": "The reading comprehension task, that asks questions about a given evidence document, is a central problem in natural language understanding. Recent formulations of this task have typically focused on answer selection from a set of candidates pre-defined manually or through the use of an external NLP pipeline. However, Rajpurkar et al. (2016) recently released the SQUAD dataset in which the answers can be arbitrary strings from the supplied text. In this paper, we focus on this answer extraction task, presenting a novel model architecture that efficiently builds fixed length representations of all spans in the evidence document with a recurrent network. We show that scoring explicit span representations significantly improves performance over other approaches that factor the prediction into separate predictions about words or start and end markers. 
Our approach improves upon the best published results of Wang & Jiang (2016) by 5% and decreases the error of Rajpurkar et al.\u2019s baseline by > 50%.", "keywords": "Natural language processing", "primary_area": "", "supplementary_material": "", "author": "Kenton Lee;Tom Kwiatkowksi;Ankur Parikh;Dipanjan Das", "authorids": "kentonl@cs.washington.edu;tomkwiat@google.com;aparikh@google.com;dipanjand@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nlee2017learning,\ntitle={Learning Recurrent Span Representations for Extractive Question Answering},\nauthor={Kenton Lee and Tom Kwiatkowksi and Ankur Parikh and Dipanjan Das},\nyear={2017},\nurl={https://openreview.net/forum?id=HkIQH7qel}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HkIQH7qel", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;5;4", "rating_avg": 6.333333333333333, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 172, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3692283012960542431&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "HkJq1Ocxl", "title": "Programming With a Differentiable Forth Interpreter", "track": "main", "status": "Workshop", "tldr": "This paper presents the first neural implementation of an abstract machine for an actual language, allowing programmers to inject prior procedural knowledge into neural architectures in a straightforward manner.", "abstract": "There are families of neural networks that can learn to compute any function, provided sufficient training data. However, given that in practice training data is scarce for all but a small set of problems, a core question is how to incorporate prior knowledge into a model. Here we consider the case of prior procedural knowledge, such as knowing the overall recursive structure of a sequence transduction program or the fact that a program will likely use arithmetic operations on real numbers to solve a task. To this end we present a differentiable interpreter for the programming language Forth. Through a neural implementation of the dual stack machine that underlies Forth, programmers can write program sketches with slots that can be filled with behaviour trained from program input-output data. As the program interpreter is end-to-end differentiable, we can optimize this behaviour directly through gradient descent techniques on user specified objectives, and also integrate the program into any larger neural computation graph. We show empirically that our interpreter is able to effectively leverage different levels of prior program structure and learn complex transduction tasks such as sequence sorting or addition with substantially less data and better generalisation over problem sizes. In addition, we introduce neural program optimisations based on symbolic computation and parallel branching that lead to significant speed improvements. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Matko Bo\u0161njak;Tim Rockt\u00e4schel;Jason Naradowsky;Sebastian Riedel", "authorids": "m.bosnjak@cs.ucl.ac.uk;t.rocktaschel@cs.ucl.ac.uk;j.narad@cs.ucl.ac.uk;s.riedel@cs.ucl.ac.uk", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nlee2017making,\ntitle={Making Stochastic Neural Networks from Deterministic Ones},\nauthor={Kimin Lee and Jaehyung Kim and Song Chong and Jinwoo Shin},\nyear={2017},\nurl={https://openreview.net/forum?id=B1akgy9xx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HkJq1Ocxl", "pdf_size": 0, "rating": "5;6;7", "confidence": "2;4;2", "rating_avg": 6.0, "confidence_avg": 2.6666666666666665, "replies_avg": 11, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 121, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=414036379914758548&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12 }, { "id": "HkLXCE9lx", "title": "RL^2: Fast Reinforcement Learning via Slow Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "We propose to learn a \u201cfast\u201d reinforcement learning algorithm using standard, off-the-shelf (\u201cslow\u201d) reinforcement learning algorithms, where the \u201cfast\u201d version is represented as an RNN, and fast RL happens inside its activations.", "abstract": "Deep reinforcement learning (deep RL) has been successful in learning sophisticated behaviors automatically; however, the learning process requires a huge number of trials. In contrast, animals can learn new tasks in just a few trials, benefiting from their prior knowledge about the world. This paper seeks to bridge this gap. Rather than designing a \u201cfast\u201d reinforcement learning algorithm, we propose to represent it as a recurrent neural network (RNN) and learn it from data. In our proposed method, RL^2, the algorithm is encoded in the weights of the RNN, which are learned slowly through a general-purpose (\u201cslow\u201d) RL algorithm. The RNN receives all information a typical RL algorithm would receive, including observations, actions, rewards, and termination flags; and it retains its state across episodes in a given Markov Decision Process (MDP). The activations of the RNN store the state of the \u201cfast\u201d RL algorithm on the current (previously unseen) MDP. We evaluate RL^2 experimentally on both small-scale and large-scale problems. On the small-scale side, we train it to solve randomly generated multi-arm bandit problems and finite MDPs. After RL^2 is trained, its performance on new MDPs is close to human-designed algorithms with optimality guarantees. On the large-scale side, we test RL^2 on a vision-based navigation task and show that it scales up to high-dimensional problems.", "keywords": "Reinforcement Learning;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Yan Duan;John Schulman;Xi Chen;Peter L. 
Bartlett;Ilya Sutskever;Pieter Abbeel", "authorids": "rocky@openai.com;joschu@openai.com;peter@openai.com;peter@berkeley.edu;ilyasu@openai.com;pieter@openai.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nduan2017rl,\ntitle={{RL}{\\textasciicircum}2: Fast Reinforcement Learning via Slow Reinforcement Learning},\nauthor={Yan Duan and John Schulman and Xi Chen and Peter L. Bartlett and Ilya Sutskever and Pieter Abbeel},\nyear={2017},\nurl={https://openreview.net/forum?id=HkLXCE9lx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HkLXCE9lx", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;4;3", "rating_avg": 3.3333333333333335, "confidence_avg": 3.6666666666666665, "replies_avg": 15, "authors#_avg": 6, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 1284, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13749904130207868626&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "HkNEuToge", "title": "Energy-Based Spherical Sparse Coding", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we explore an efficient variant of convolutional sparse coding with unit norm code vectors and reconstructions are evaluated using an inner product (cosine distance). To use these codes for discriminative classification, we describe a model we term Energy-Based Spherical Sparse Coding (EB-SSC) in which the hypothesized class label introduces a learned linear bias into the coding step. We evaluate and visualize performance of stacking this encoder to make a deep layered model for image classification.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bailey Kong;Charless C. Fowlkes", "authorids": "bhkong@ics.uci.edu;fowlkes@ics.uci.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nkong2017energybased,\ntitle={Energy-Based Spherical Sparse Coding},\nauthor={Bailey Kong and Charless C. Fowlkes},\nyear={2017},\nurl={https://openreview.net/forum?id=HkNEuToge}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HkNEuToge", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 13, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1343976313216911519&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "HkNKFiGex", "title": "Neural Photo Editing with Introspective Adversarial Networks", "track": "main", "status": "Poster", "tldr": "An interface for editing photos using generative image models.", "abstract": "The increasingly photorealistic sample quality of generative image models suggests their feasibility in applications beyond image generation. We present the Neural Photo Editor, an interface that leverages the power of generative neural networks to make large, semantically coherent changes to existing images. 
To tackle the challenge of achieving accurate reconstructions without loss of feature quality, we introduce the Introspective Adversarial Network, \na novel hybridization of the VAE and GAN. Our model efficiently captures long-range dependencies through use of a computational block based on weight-shared dilated convolutions, and improves generalization performance with Orthogonal Regularization, a novel weight regularization method. We validate our contributions on CelebA, SVHN, and CIFAR-100, and produce samples and reconstructions with high visual fidelity.", "keywords": "Computer vision;Unsupervised Learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Andrew Brock;Theodore Lim;J.M. Ritchie;Nick Weston", "authorids": "ajb5@hw.ac.uk;t.lim@hw.ac.uk;j.m.ritchie@hw.ac.uk;Nick.Weston@renishaw.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nbrock2017neural,\ntitle={Neural Photo Editing with Introspective Adversarial Networks},\nauthor={Andrew Brock and Theodore Lim and J.M. Ritchie and Nick Weston},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=HkNKFiGex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HkNKFiGex", "pdf_size": 0, "rating": "5;6;6", "confidence": "3;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 17, "authors#_avg": 4, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 593, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13506818224034936115&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "HkNRsU5ge", "title": "Sigma Delta Quantized Networks", "track": "main", "status": "Poster", "tldr": "A deep neural network that saves computation on temporal data by using neurons that only communicate their changes in activation", "abstract": "Deep neural networks can be obscenely wasteful. When processing video, a convolutional network expends a fixed amount of computation for each frame with no regard to the similarity between neighbouring frames. As a result, it ends up repeatedly doing very similar computations. To put an end to such waste, we introduce Sigma-Delta networks. With each new input, each layer in this network sends a discretized form of its change in activation to the next layer. Thus the amount of computation that the network does scales with the amount of change in the input and layer activations, rather than the size of the network. 
We introduce an optimization method for converting any pre-trained deep network into an optimally efficient Sigma-Delta network, and show that our algorithm, if run on the appropriate hardware, could cut at least an order of magnitude from the computational cost of processing video data.\n", "keywords": "Computer vision;Deep learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Peter O'Connor;Max Welling", "authorids": "peter.ed.oconnor@gmail.com;max.welling@uva.nl", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\no'connor2017sigma,\ntitle={Sigma Delta Quantized Networks},\nauthor={Peter O'Connor and Max Welling},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=HkNRsU5ge}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HkNRsU5ge", "pdf_size": 0, "rating": "6;8;8", "confidence": "3;4;4", "rating_avg": 7.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 13, "authors#_avg": 2, "corr_rating_confidence": 0.9999999999999998, "gs_citation": 56, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=900346696179721483&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "HkSOlP9lg", "title": "Recurrent Inference Machines for Solving Inverse Problems", "track": "main", "status": "Reject", "tldr": "", "abstract": "Inverse problems are typically solved by first defining a model and then choosing an inference procedure. With this separation of modeling from inference, inverse problems can be framed in a modular way. For example, variational inference can be applied to a broad class of models. The modularity, however, typically goes away after model parameters have been trained under a chosen inference procedure. During training, model and inference often interact in a way that the model parameters will ultimately be adapted to the chosen inference procedure, making the two components inseparable after training. But if model and inference become inseparable after training, why separate them in the first place?\n\nWe propose a novel learning framework which abandons the dichotomy between model and inference. Instead, we introduce Recurrent Inference Machines (RIM), a class of recurrent neural networks (RNN) that directly learn to solve inverse problems.\n\nWe demonstrate the effectiveness of RIMs in experiments on various image reconstruction tasks. 
We show empirically that RIMs exhibit the desirable convergence behavior of classical inference procedures, and that they can outperform state-of-the-art methods when trained on specialized inference tasks.\n\nOur approach bridges the gap between inverse problems and deep learning, providing a framework for fast progression in the field of inverse problems.", "keywords": "Optimization;Deep learning;Computer vision", "primary_area": "", "supplementary_material": "", "author": "Patrick Putzky;Max Welling", "authorids": "patrick.putzky@gmail.com;welling.max@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nputzky2017recurrent,\ntitle={Recurrent Inference Machines for Solving Inverse Problems},\nauthor={Patrick Putzky and Max Welling},\nyear={2017},\nurl={https://openreview.net/forum?id=HkSOlP9lg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HkSOlP9lg", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;4;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 12, "authors#_avg": 2, "corr_rating_confidence": -0.944911182523068, "gs_citation": 144, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2575144744672605748&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "HkYhZDqxg", "title": "Tree-structured decoding with doubly-recurrent neural networks", "track": "main", "status": "Poster", "tldr": "A new architecture for generating tree-structured objects from encoded representations, which models separately the width and depth recurrences across the tree and predicts both content and topology.", "abstract": "We propose a neural network architecture for generating tree-structured objects from encoded representations. The core of the method is a doubly-recurrent neural network that models separately the width and depth recurrences across the tree, and combines them inside each cell to generate an output. The topology of the tree is explicitly modeled, allowing the network to predict both content and topology of the tree when decoding. That is, given only an encoded vector representation, the network is able to simultaneously generate a tree from it and predict labels for the nodes. We test this architecture in an encoder-decoder framework, where we train a network to encode a sentence as a vector, and then generate a tree structure from it. The experimental results show the effectiveness of this architecture at recovering latent tree structure in sequences and at mapping sentences to simple functional programs.", "keywords": "Natural language processing;Supervised Learning;Structured prediction", "primary_area": "", "supplementary_material": "", "author": "David Alvarez-Melis;Tommi S. Jaakkola", "authorids": "dalvmel@mit.edu;tommi@csail.mit.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nalvarez-melis2017treestructured,\ntitle={Tree-structured decoding with doubly-recurrent neural networks},\nauthor={David Alvarez-Melis and Tommi S. 
Jaakkola},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=HkYhZDqxg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=HkYhZDqxg", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;4", "rating_avg": 6.333333333333333, "confidence_avg": 4.0, "replies_avg": 22, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 115, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14877003498075626479&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "HkcdHtqlx", "title": "Gated-Attention Readers for Text Comprehension", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper we study the problem of answering cloze-style questions over documents. Our model, the Gated-Attention (GA) Reader, integrates a multi-hop architecture with a novel attention mechanism, which is based on multiplicative interactions between the query embedding and the intermediate states of a recurrent neural network document reader. This enables the reader to build query-specific representations of tokens in the document for accurate answer selection. The GA Reader obtains state-of-the-art results on three benchmarks for this task--the CNN \\& Daily Mail news stories and the Who Did What dataset. The effectiveness of multiplicative interaction is demonstrated by an ablation study, and by comparing to alternative compositional operators for implementing the gated-attention. ", "keywords": "Natural language processing;Deep learning;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Bhuwan Dhingra;Hanxiao Liu;Zhilin Yang;William W. Cohen;Ruslan Salakhutdinov", "authorids": "bdhingra@cs.cmu.edu;hanxiaol@cs.cmu.edu;zhiliny@cs.cmu.edu;wcohen@cs.cmu.edu;rsalakhu@cs.cmu.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\ndhingra2017gatedattention,\ntitle={Gated-Attention Readers for Text Comprehension},\nauthor={Bhuwan Dhingra and Hanxiao Liu and Zhilin Yang and William W. Cohen and Ruslan Salakhutdinov},\nyear={2017},\nurl={https://openreview.net/forum?id=HkcdHtqlx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HkcdHtqlx", "pdf_size": 0, "rating": "6;6;7", "confidence": "0;3;3", "rating_avg": 6.333333333333333, "confidence_avg": 2.0, "replies_avg": 26, "authors#_avg": 5, "corr_rating_confidence": 0.49999999999999994, "gs_citation": 478, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3290403078094631843&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12 }, { "id": "Hkg4TI9xl", "title": "A Baseline for Detecting Misclassified and Out-of-Distribution Examples in Neural Networks", "track": "main", "status": "Poster", "tldr": "Methods to Detect When a Network Is Wrong", "abstract": "We consider the two related problems of detecting if an example is misclassified or out-of-distribution. We present a simple baseline that utilizes probabilities from softmax distributions. Correctly classified examples tend to have greater maximum softmax probabilities than erroneously classified and out-of-distribution examples, allowing for their detection. 
We assess performance by defining several tasks in computer vision, natural language processing, and automatic speech recognition, showing the effectiveness of this baseline across all. We then show the baseline can sometimes be surpassed, demonstrating the room for future research on these underexplored detection tasks.", "keywords": "Computer vision", "primary_area": "", "supplementary_material": "", "author": "Dan Hendrycks;Kevin Gimpel", "authorids": "dan@ttic.edu;kgimpel@ttic.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nhendrycks2017a,\ntitle={A Baseline for Detecting Misclassified and Out-of-Distribution Examples in Neural Networks},\nauthor={Dan Hendrycks and Kevin Gimpel},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=Hkg4TI9xl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Hkg4TI9xl", "pdf_size": 0, "rating": "6;6;6", "confidence": "3;3;3", "rating_avg": 6.0, "confidence_avg": 3.0, "replies_avg": 12, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 4314, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14505244835813531476&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "Hkg8bDqee", "title": "Introspection:Accelerating Neural Network Training By Learning Weight Evolution", "track": "main", "status": "Poster", "tldr": "Acceleration of training by performing weight updates, using knowledge obtained from training other neural networks.", "abstract": "Neural Networks are function approximators that have achieved state-of-the-art accuracy in numerous machine learning tasks. In spite of their great success in terms of accuracy, their large training time makes it difficult to use them for various tasks. In this paper, we explore the idea of learning weight evolution pattern from a simple network for accelerating training of novel neural networks.\n\nWe use a neural network to learn the training pattern from MNIST classification and utilize it to accelerate training of neural networks used for CIFAR-10 and ImageNet classification. Our method has a low memory footprint and is computationally efficient. This method can also be used with other optimizers to give faster convergence. 
The results indicate a general trend in the weight evolution during training of neural networks.", "keywords": "Computer vision;Deep learning;Optimization", "primary_area": "", "supplementary_material": "", "author": "Abhishek Sinha;Aahitagni Mukherjee;Mausoom Sarkar;Balaji Krishnamurthy", "authorids": "abhishek.sinha94@gmail.com;ahitagnimukherjeeam@gmail.com;msarkar@adobe.com;kbalaji@adobe.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nsinha2017introspectionaccelerating,\ntitle={Introspection:Accelerating Neural Network Training By Learning Weight Evolution},\nauthor={Abhishek Sinha and Aahitagni Mukherjee and Mausoom Sarkar and Balaji Krishnamurthy},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=Hkg8bDqee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Hkg8bDqee", "pdf_size": 0, "rating": "7;8;9", "confidence": "4;5;5", "rating_avg": 8.0, "confidence_avg": 4.666666666666667, "replies_avg": 23, "authors#_avg": 4, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11125464888439003456&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "HkljfjFee", "title": "Support Regularized Sparse Coding and Its Fast Encoder", "track": "main", "status": "Poster", "tldr": "We present Support Regularized Sparse Coding (SRSC) to improve the regular sparse coding, and propose a feed-forward neural network termed Deep Support Regularized Sparse Coding (Deep-SRSC) as its fast encoder.", "abstract": "Sparse coding represents a signal by a linear combination of only a few atoms of a learned over-complete dictionary. While sparse coding exhibits compelling performance for various machine learning tasks, the process of obtaining sparse code with fixed dictionary is independent for each data point without considering the geometric information and manifold structure of the entire data. We propose Support Regularized Sparse Coding (SRSC) which produces sparse codes that account for the manifold structure of the data by encouraging nearby data in the manifold to choose similar dictionary atoms. In this way, the obtained support regularized sparse codes capture the locally linear structure of the data manifold and enjoy robustness to data noise. We present the optimization algorithm of SRSC with theoretical guarantee for the optimization over the sparse codes. We also propose a feed-forward neural network termed Deep Support Regularized Sparse Coding (Deep-SRSC) as a fast encoder to approximate the sparse codes generated by SRSC. Extensive experimental results demonstrate the effectiveness of SRSC and Deep-SRSC.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yingzhen Yang;Jiahui Yu;Pushmeet Kohli;Jianchao Yang;Thomas S. 
Huang", "authorids": "superyyzg@gmail.com;jyu79@illinois.edu;pkohli@microsoft.com;jianchao.yang@snapchat.com;t-huang1@illinois.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nyang2017support,\ntitle={Support Regularized Sparse Coding and Its Fast Encoder},\nauthor={Yingzhen Yang and Jiahui Yu and Pushmeet Kohli and Jianchao Yang and Thomas S. Huang},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=HkljfjFee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HkljfjFee", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;4;4", "rating_avg": 6.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 11, "authors#_avg": 5, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15754816049261448834&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "HkpLeH9el", "title": "Neural Functional Programming", "track": "main", "status": "Workshop", "tldr": "A differentiable functional programming language for learning programs from input-output examples.", "abstract": "We discuss a range of modeling choices that arise when constructing an end-to-end differentiable programming language suitable for learning programs from input-output examples. Taking cues from programming languages research, we study the effect of memory allocation schemes, immutable data, type systems, and built-in control-flow structures on the success rate of learning algorithms. We build a range of models leading up to a simple differentiable functional programming language. Our empirical evaluation shows that this language allows to learn far more programs than existing baselines.", "keywords": "Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "John K. Feser;Marc Brockschmidt;Alexander L. Gaunt;Daniel Tarlow", "authorids": "feser@csail.mit.edu;mabrocks@microsoft.com;t-algaun@microsoft.com;dtarlow@microsoft.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nlee2017making,\ntitle={Making Stochastic Neural Networks from Deterministic Ones},\nauthor={Kimin Lee and Jaehyung Kim and Song Chong and Jinwoo Shin},\nyear={2017},\nurl={https://openreview.net/forum?id=B1akgy9xx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer5;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HkpLeH9el", "pdf_size": 0, "rating": "4;5;5;6;7", "confidence": "3;2;3;3;2", "rating_avg": 5.4, "confidence_avg": 2.6, "replies_avg": 15, "authors#_avg": 4, "corr_rating_confidence": -0.4803844614152616, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14420512703279490945&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "HkpbnH9lx", "title": "Density estimation using Real NVP", "track": "main", "status": "Poster", "tldr": "Efficient invertible neural networks for density estimation and generation", "abstract": "Unsupervised learning of probabilistic models is a central yet challenging problem in machine learning. 
Specifically, designing models with tractable learning, sampling, inference and evaluation is crucial in solving this task. We extend the space of such models using real-valued non-volume preserving (real NVP) transformations, a set of powerful invertible and learnable transformations, resulting in an unsupervised learning algorithm with exact log-likelihood computation, exact sampling, exact inference of latent variables, and an interpretable latent space. We demonstrate its ability to model natural images on four datasets through sampling, log-likelihood evaluation and latent variable manipulations.", "keywords": "Deep learning;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Laurent Dinh;Jascha Sohl-Dickstein;Samy Bengio", "authorids": "dinh.laurent@gmail.com;jaschasd@google.com;bengio@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ndinh2017density,\ntitle={Density estimation using Real {NVP}},\nauthor={Laurent Dinh and Jascha Sohl-Dickstein and Samy Bengio},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=HkpbnH9lx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=HkpbnH9lx", "pdf_size": 0, "rating": "7;8;8", "confidence": "4;4;4", "rating_avg": 7.666666666666667, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 4547, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6875639475985157714&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 14 }, { "id": "HksioDcxl", "title": "Joint Training of Ratings and Reviews with Recurrent Recommender Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Accurate modeling of ratings and text reviews is at the core of successful recommender systems. While neural networks have been remarkably successful in modeling images and natural language, they have been largely unexplored in recommender system research. In this paper, we provide a neural network model that combines ratings, reviews, and temporal patterns to learn highly accurate recommendations. We co-train for prediction on both numerical ratings and natural language reviews, as well as using a recurrent architecture to capture the dynamic components of users' and items' states. We demonstrate that incorporating text reviews and temporal dynamic gives state-of-the-art results over the IMDb dataset.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chao-Yuan Wu;Amr Ahmed;Alex Beutel;Alexander J. Smola", "authorids": "cywu@cs.utexas.edu;amra@google.com;alexbeutel@google.com;alex@smola.org", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nwu2017joint,\ntitle={Joint Training of Ratings and Reviews with Recurrent Recommender Networks},\nauthor={Chao-Yuan Wu and Amr Ahmed and Alex Beutel and Alexander J. 
Smola},\nyear={2017},\nurl={https://openreview.net/forum?id=HksioDcxl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HksioDcxl", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;3;4", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 17, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "Hku9NK5lx", "title": "Training Compressed Fully-Connected Networks with a Density-Diversity Penalty", "track": "main", "status": "Poster", "tldr": "We propose a new ''density-diversity penalty'' to fully-connected layers to get significantly high sparsity and low diversity trained matrices, while keeping the performance the same.", "abstract": "Deep models have achieved great success on a variety of challenging tasks. How- ever, the models that achieve great performance often have an enormous number of parameters, leading to correspondingly great demands on both computational and memory resources, especially for fully-connected layers. In this work, we propose a new \u201cdensity-diversity penalty\u201d regularizer that can be applied to fully-connected layers of neural networks during training. We show that using this regularizer results in significantly fewer parameters (i.e., high sparsity), and also significantly fewer distinct values (i.e., low diversity), so that the trained weight matrices can be highly compressed without any appreciable loss in performance. The resulting trained models can hence reside on computational platforms (e.g., portables, Internet-of-Things devices) where it otherwise would be prohibitive.", "keywords": "Deep learning", "primary_area": "", "supplementary_material": "", "author": "Shengjie Wang;Haoran Cai;Jeff Bilmes;William Noble", "authorids": "wangsj@cs.washington.edu;haoran@uw.edu;bilmes@uw.edu;william-noble@u.washington.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nwang2017training,\ntitle={Training Compressed Fully-Connected Networks with a Density-Diversity Penalty},\nauthor={Shengjie Wang and Haoran Cai and Jeff Bilmes and William Noble},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=Hku9NK5lx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=Hku9NK5lx", "pdf_size": 0, "rating": "6;6;9", "confidence": "4;2;4", "rating_avg": 7.0, "confidence_avg": 3.3333333333333335, "replies_avg": 22, "authors#_avg": 4, "corr_rating_confidence": 0.5000000000000001, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10791124894320544245&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "HkuVu3ige", "title": "On orthogonality and learning recurrent networks with long term dependencies", "track": "main", "status": "Reject", "tldr": "While orthogonal matrices improve neural network stability during training, deviating from orthogonality may improve model convergence speed and performance.", "abstract": "It is well known that it is challenging to train deep neural networks and recurrent neural networks for tasks that exhibit long term dependencies. 
The vanishing or exploding gradient problem is a well known issue associated with these challenges. One approach to addressing vanishing and exploding gradients is to use either soft or hard constraints on weight matrices so as to encourage or enforce orthogonality. Orthogonal matrices preserve gradient norm during backpropagation and can therefore be a desirable property; however, we find that hard constraints on orthogonality can negatively affect the speed of convergence and model performance. This paper explores the issues of optimization convergence, speed and gradient stability using a variety of different methods for encouraging or enforcing orthogonality. In particular we propose a weight matrix factorization and parameterization strategy through which we we can bound matrix norms and therein control the degree of expansivity induced during backpropagation.", "keywords": "Deep learning", "primary_area": "", "supplementary_material": "", "author": "Eugene Vorontsov;Chiheb Trabelsi;Samuel Kadoury;Chris Pal", "authorids": "eugene.vorontsov@gmail.com;chiheb.trabelsi@polymtl.ca;samuel.kadoury@polymtl.ca;christopher.pal@polymtl.ca", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nvorontsov2017on,\ntitle={On orthogonality and learning recurrent networks with long term dependencies},\nauthor={Eugene Vorontsov and Chiheb Trabelsi and Samuel Kadoury and Chris Pal},\nyear={2017},\nurl={https://openreview.net/forum?id=HkuVu3ige}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HkuVu3ige", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;5;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 15, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 281, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12745127873332927893&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "HkvS3Mqxe", "title": "Coarse Pruning of Convolutional Neural Networks with Random Masks", "track": "main", "status": "Reject", "tldr": "This work has proposed a new pruning strategy for CNN. Further, feature map and kernel pruning granularities are proposed for good pruning ratios and simple sparse representation.", "abstract": "The learning capability of a neural network improves with increasing depth at\nhigher computational costs. Wider layers with dense kernel connectivity patterns\nfurther increase this cost and may hinder real-time inference. We propose feature\nmap and kernel level pruning for reducing the computational complexity of\na deep convolutional neural network. Pruning feature maps reduces the width\nof a layer and hence does not need any sparse representation. Further, kernel\npruning changes the dense connectivity pattern into a sparse one. Due to coarse\nnature, these pruning granularities can be exploited by GPUs and VLSI based\nimplementations. We propose a simple strategy to choose the least adversarial\npruning masks. The proposed approach is generic and can select good pruning\nmasks for feature map, kernel and intra-kernel pruning. The pruning masks are\ngenerated randomly, and the best performing one is selected using the evaluation\nset. 
The sufficient number of random pruning masks to try depends on the pruning\nratio, and is around 100 when 40% complexity reduction is needed. The pruned\nnetwork is retrained to compensate for the loss in accuracy. We have extensively\nevaluated the proposed approach with the CIFAR-10, SVHN and MNIST datasets.\nExperiments with the CIFAR-10 dataset show that more than 85% sparsity can be\ninduced in the convolution layers with less than 1% increase in the misclassification\nrate of the baseline network.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sajid Anwar;Wonyong Sung", "authorids": "sajid@dsp.snu.ac.kr;wysung@snu.ac.kr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nanwar2017coarse,\ntitle={Coarse Pruning of Convolutional Neural Networks with Random Masks},\nauthor={Sajid Anwar and Wonyong Sung},\nyear={2017},\nurl={https://openreview.net/forum?id=HkvS3Mqxe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=HkvS3Mqxe", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;3", "rating_avg": 5.0, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 2, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8021019144930790693&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "HkwoSDPgg", "title": "Semi-supervised Knowledge Transfer for Deep Learning from Private Training Data", "track": "main", "status": "Oral", "tldr": "Semi-supervised learning of a privacy-preserving student model with GANs by knowledge transfer from an ensemble of teachers trained on partitions of private data.", "abstract": "Some machine learning applications involve training data that is sensitive, such\nas the medical histories of patients in a clinical trial. A model may\ninadvertently and implicitly store some of its training data; careful analysis\nof the model may therefore reveal sensitive information.\n\nTo address this problem, we demonstrate a generally applicable approach to\nproviding strong privacy guarantees for training data: Private Aggregation of Teacher Ensembles (PATE). The approach combines, in\na black-box fashion, multiple models trained with disjoint datasets, such as\nrecords from different subsets of users. Because they rely directly on sensitive\ndata, these models are not published, but instead used as ''teachers'' for a ''student'' model. \nThe student learns to predict an output chosen by noisy voting\namong all of the teachers, and cannot directly access an individual teacher or\nthe underlying data or parameters. The student's privacy properties can be\nunderstood both intuitively (since no single teacher and thus no single dataset\ndictates the student's training) and formally, in terms of differential privacy.\n These properties hold even if an adversary can not only query the student but\nalso inspect its internal workings.\n\nCompared with previous work, the approach imposes only weak assumptions on how\nteachers are trained: it applies to any model, including non-convex models like\nDNNs. 
We achieve state-of-the-art privacy/utility trade-offs on MNIST and SVHN\nthanks to an improved privacy analysis and semi-supervised learning.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nicolas Papernot;Mart\u00edn Abadi;\u00dalfar Erlingsson;Ian Goodfellow;Kunal Talwar", "authorids": "ngp5056@cse.psu.edu;abadi@google.com;ulfar@google.com;ian@openai.com;kunal@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\npapernot2017semisupervised,\ntitle={Semi-supervised Knowledge Transfer for Deep Learning from Private Training Data},\nauthor={Nicolas Papernot and Mart{\\'\\i}n Abadi and {\\'U}lfar Erlingsson and Ian Goodfellow and Kunal Talwar},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=HkwoSDPgg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HkwoSDPgg", "pdf_size": 0, "rating": "7;9;9", "confidence": "3;4;4", "rating_avg": 8.333333333333334, "confidence_avg": 3.6666666666666665, "replies_avg": 22, "authors#_avg": 5, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 1306, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7453137533162499463&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9 }, { "id": "HkxAAvcxx", "title": "Transformation-based Models of Video Sequences", "track": "main", "status": "Reject", "tldr": "Predict next frames of a video sequence by modelling transformations", "abstract": "In this work we propose a simple unsupervised approach for next frame prediction in video. Instead of directly predicting the pixels in a frame given past frames, we predict the transformations needed for generating the next frame in a sequence, given the transformations of the past frames. This leads to sharper results, while using a smaller prediction model.\n\nIn order to enable a fair comparison between different video frame prediction models, we also propose a new evaluation protocol. We use generated frames as input to a classifier trained with ground truth sequences. This criterion guarantees that models scoring high are those producing sequences which preserve discrim- inative features, as opposed to merely penalizing any deviation, plausible or not, from the ground truth. 
Our proposed approach compares favourably against more sophisticated ones on the UCF-101 data set, while also being more efficient in terms of the number of parameters and computational cost.", "keywords": "Computer vision;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Joost van Amersfoort;Anitha Kannan;Marc'Aurelio Ranzato;Arthur Szlam;Du Tran;Soumith Chintala", "authorids": "joost@joo.st;akannan@fb.com;ranzato@fb.com;aszlam@fb.com;trandu@fb.com;soumith@fb.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\namersfoort2017transformationbased,\ntitle={Transformation-based Models of Video Sequences},\nauthor={Joost van Amersfoort and Anitha Kannan and Marc'Aurelio Ranzato and Arthur Szlam and Du Tran and Soumith Chintala},\nyear={2017},\nurl={https://openreview.net/forum?id=HkxAAvcxx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HkxAAvcxx", "pdf_size": 0, "rating": "3;5;6", "confidence": "3;3;4", "rating_avg": 4.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 14, "authors#_avg": 6, "corr_rating_confidence": 0.7559289460184545, "gs_citation": 86, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12152805440849739187&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "HkyYqU9lx", "title": "Sequence to Sequence Transduction with Hard Monotonic Attention", "track": "main", "status": "Reject", "tldr": "Sequence to sequence learning with a hard attention mechanism that works better than soft attention models on monotonically aligned sequences", "abstract": "We present a supervised sequence to sequence transduction model with a hard attention mechanism which combines the more traditional statistical alignment methods with the power of recurrent neural networks. We evaluate the model on the task of morphological inflection generation and show that it provides state of the art results in various setups compared to the previous neural and non-neural approaches. 
Eventually we present an analysis of the learned representations for both hard and soft attention models, shedding light on the features such models extract in order to solve the task.", "keywords": "Natural language processing;Applications", "primary_area": "", "supplementary_material": "", "author": "Roee Aharoni;Yoav Goldberg", "authorids": "roee.aharoni@gmail.com;yoav.goldberg@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\naharoni2017sequence,\ntitle={Sequence to Sequence Transduction with Hard Monotonic Attention},\nauthor={Roee Aharoni and Yoav Goldberg},\nyear={2017},\nurl={https://openreview.net/forum?id=HkyYqU9lx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HkyYqU9lx", "pdf_size": 0, "rating": "4;5;5", "confidence": "5;4;3", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 2, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5214154174212152908&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "Hkz6aNqle", "title": "Deep Error-Correcting Output Codes", "track": "main", "status": "Reject", "tldr": "", "abstract": "Existing deep networks are generally initialized with unsupervised methods, such as random assignments and greedy layerwise pre-training. This may result in the whole training process (initialization/pre-training + fine-tuning) to be very time consuming. In this paper, we combine the ideas of ensemble learning and deep learning, and present a novel deep learning framework called deep error-correcting output codes (DeepECOC). DeepECOC are composed of multiple layers of the ECOC module, which combines multiple binary classifiers for feature learning. Here, the weights learned for the binary classifiers can be considered as weights between two successive layers, while the outputs of the combined binary classifiers as the outputs of a hidden layer. On the one hand, the ECOC modules can be learned using given supervisory information, and on the other hand, based on the ternary coding design, the weights can be learned only using part of the training data. Hence, the supervised pre-training of DeepECOC is in general very effective and efficient. We have conducted extensive experiments to compare DeepECOC with traditional ECOC, feature learning and deep learning algorithms on several benchmark data sets. The results demonstrate that DeepECOC perform not only better than traditional ECOC and feature learning algorithms, but also state-of the-art deep learning models in most cases. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Guoqiang Zhong;Yuchen Zheng;Peng Zhang;Mengqi Li;Junyu Dong", "authorids": "gqzhong@ouc.edu.cn;ouczyc@outlook.com;sdrzbruce@163.com;enri9615@outlook.com;dongjunyu@ouc.edu.cn", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nzhong2017deep,\ntitle={Deep Error-Correcting Output Codes},\nauthor={Guoqiang Zhong and Yuchen Zheng and Peng Zhang and Mengqi Li and Junyu Dong},\nyear={2017},\nurl={https://openreview.net/forum?id=Hkz6aNqle}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Hkz6aNqle", "pdf_size": 0, "rating": "3;3;3", "confidence": "5;4;5", "rating_avg": 3.0, "confidence_avg": 4.666666666666667, "replies_avg": 9, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=628470039589928376&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "HkzuKpLgg", "title": "Efficient Communications in Training Large Scale Neural Networks", "track": "main", "status": "Reject", "tldr": "Tackle the communications in the parallel training of neural networks", "abstract": "We consider the problem of how to reduce the cost of communication that is re- quired for the parallel training of a neural network. The state-of-the-art method, Bulk Synchronous Parallel Stochastic Gradient Descent (BSP-SGD), requires a many collective communication operations, like broadcasts of parameters or reduc- tions for sub-gradient aggregations, which for large messages quickly dominates overall execution time and limits parallel scalability. To address this problem, we develop a new technique for collective operations, referred to as Linear Pipelining (LP). It is tuned to the message sizes that arise in BSP-SGD, and works effectively on multi-GPU systems. Theoretically, the cost of LP is invariant to P , where P is the number of GPUs, while the cost of more conventional Minimum Spanning Tree (MST) scales like O(log P ). LP also demonstrate up to 2x faster bandwidth than Bidirectional Exchange (BE) techniques that are widely adopted by current MPI implementations. 
We apply these collectives to BSP-SGD, showing that the proposed implementations reduce communication bottlenecks in practice while preserving the attractive convergence properties of BSP-SGD.", "keywords": "Applications;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Linnan Wang;Wei Wu;George Bosilca;Richard Vuduc;Zenglin Xu", "authorids": "linnan.wang@gatech.edu;wwu12@vols.utk.edu;bosilca@icl.utk.edu;richie@cc.gatech.edu;zlxu@uestc.edu.cn", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nwang2017efficient,\ntitle={Efficient Communications in Training Large Scale Neural Networks},\nauthor={Linnan Wang and Wei Wu and George Bosilca and Richard Vuduc and Zenglin Xu},\nyear={2017},\nurl={https://openreview.net/forum?id=HkzuKpLgg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HkzuKpLgg", "pdf_size": 0, "rating": "5;5", "confidence": "3;5", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 18, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15391234175320619963&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "Hy-2G6ile", "title": "Gated Multimodal Units for Information Fusion", "track": "main", "status": "Workshop", "tldr": "Gated Multimodal Units: a novel unit that learns to combine multiple modalities using multiplicative gates", "abstract": "This paper presents a novel model for multimodal learning based on gated neural networks. The Gated Multimodal Unit (GMU) model is intended to be used as an internal unit in a neural network architecture whose purpose is to find an intermediate representation based on a combination of data from different modalities. The GMU learns to decide how modalities influence the activation of the unit using multiplicative gates. It was evaluated on a multilabel scenario for genre classification of movies using the plot and the poster. The GMU improved the macro f-score performance of single-modality approaches and outperformed other fusion strategies, including mixture of experts models. Along with this work, the MM-IMDb dataset is released which, to the best of our knowledge, is the largest publicly available multimodal dataset for genre prediction on movies.", "keywords": "Multi-modal learning;Applications;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "John Arevalo;Thamar Solorio;Manuel Montes-y-G\u00f3mez;Fabio A. 
Gonz\u00e1lez", "authorids": "jearevaloo@unal.edu.co;solorio@cs.uh.edu;smmontesg@inaoep.mx;fagonzalezo@unal.edu.co", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nlee2017making,\ntitle={Making Stochastic Neural Networks from Deterministic Ones},\nauthor={Kimin Lee and Jaehyung Kim and Song Chong and Jinwoo Shin},\nyear={2017},\nurl={https://openreview.net/forum?id=B1akgy9xx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Hy-2G6ile", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;3;5", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 16, "authors#_avg": 4, "corr_rating_confidence": 0.3273268353539886, "gs_citation": 486, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18206021174214727264&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "Hy-lMNqex", "title": "Tartan: Accelerating Fully-Connected and Convolutional Layers in Deep Learning Networks by Exploiting Numerical Precision Variability", "track": "main", "status": "Reject", "tldr": "A hardware accelerator whose execution time for Fully-Connected and Convolutional Layers in CNNs vary inversely proportional with the number of bits used to represent the input activations and/or weights.", "abstract": "Tartan {TRT} a hardware accelerator for inference with Deep Neural Networks (DNNs) is presented and evaluated on Convolutional Neural Networks. TRT exploits the variable per layer precision requirements of DNNs to deliver execution time that is proportional to the precision p in bits used per layer for convolutional and fully-connected layers. Prior art has demonstrated an accelerator with the same execution performance only for convolutional layers. Experiments on image classification CNNs show that on average across all networks studied, TRT outperforms a state-of-the-art bit-parallel accelerator by 1.90x without any loss in accuracy while it is 1.17x more energy efficient. TRT requires no network retraining while it enables trading off accuracy for additional improvements in execution performance and energy efficiency. 
For example, if a 1% relative loss in accuracy is acceptable, TRT is on average 2.04x faster and 1.25x more energy efficient than the bit-parallel accelerator.\nThis revision includes post-layout results and a better configuration that processes 2bits at time resulting in better efficiency and lower area overhead.", "keywords": "Deep learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Alberto Delm\u00e1s Lascorz;Sayeh Sharify;Patrick Judd;Andreas Moshovos", "authorids": ";;;", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nlascorz2017tartan,\ntitle={Tartan: Accelerating Fully-Connected and Convolutional Layers in Deep Learning Networks by Exploiting Numerical Precision Variability},\nauthor={Alberto Delm{\\'a}s Lascorz and Sayeh Sharify and Patrick Judd and Andreas Moshovos},\nyear={2017},\nurl={https://openreview.net/forum?id=Hy-lMNqex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer4;AnonReviewer5;AnonReviewer1", "site": "https://openreview.net/forum?id=Hy-lMNqex", "pdf_size": 0, "rating": "4;4;5;5;6", "confidence": "1;3;5;5;2", "rating_avg": 4.8, "confidence_avg": 3.2, "replies_avg": 20, "authors#_avg": 4, "corr_rating_confidence": 0.2004459314343183, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14267949647483102443&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "Hy0L4t5el", "title": "Tree-Structured Variational Autoencoder", "track": "main", "status": "Reject", "tldr": "", "abstract": "Many kinds of variable-sized data we would like to model contain an internal hierarchical structure in the form of a tree, including source code, formal logical statements, and natural language sentences with parse trees. For such data it is natural to consider a model with matching computational structure. In this work, we introduce a variational autoencoder-based generative model for tree-structured data. We evaluate our model on a synthetic dataset, and a dataset with applications to automated theorem proving. By learning a latent representation over trees, our model can achieve similar test log likelihood to a standard autoregressive decoder, but with the number of sequentially dependent computations proportional to the depth of the tree instead of the number of nodes in the tree.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Richard Shin;Alexander A. Alemi;Geoffrey Irving;Oriol Vinyals", "authorids": "ricshin@cs.berkeley.edu;alemi@google.com;geoffreyi@google.com;vinyals@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nshin2017treestructured,\ntitle={Tree-Structured Variational Autoencoder},\nauthor={Richard Shin and Alexander A. 
Alemi and Geoffrey Irving and Oriol Vinyals},\nyear={2017},\nurl={https://openreview.net/forum?id=Hy0L4t5el}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=Hy0L4t5el", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;4;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4905622326176822135&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "Hy3_KuYxg", "title": "Divide and Conquer with Neural Networks", "track": "main", "status": "Reject", "tldr": "learn dynamic programming with neural networks", "abstract": "We consider the learning of algorithmic tasks by mere observation of input-output pairs. \nRather than studying this as a black-box discrete regression problem with no assumption whatsoever \non the input-output mapping, we concentrate on tasks that are amenable to the principle of divide and conquer, and study what are its implications in terms of learning. \n\nThis principle creates a powerful inductive bias that we exploit with neural architectures that are defined recursively, by learning two scale-invariant atomic operators: how to split a given input into two disjoint sets, and how to merge two partially solved tasks into a larger partial solution. The scale invariance creates parameter sharing across all stages of the architecture, and the dynamic design creates architectures whose complexity can be tuned in a differentiable manner.\n\nAs a result, our model is trained by backpropagation not only to minimize the errors at the output, but also to do so as efficiently as possible, by enforcing shallower computation graphs. Moreover, thanks to the scale invariance, the model can be trained only with only input/output pairs, removing the need to know oracle intermediate split and merge decisions. As it turns out, accuracy and complexity are not independent qualities, and we verify empirically that when the learnt complexity matches the underlying complexity of the task, this results in higher accuracy and better generalization in two paradigmatic problems: sorting and finding planar convex hulls.", "keywords": "Deep learning", "primary_area": "", "supplementary_material": "", "author": "Alex Nowak;Joan Bruna", "authorids": "anv273@nyu.edu;bruna@cims.nyu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nnowak2017divide,\ntitle={Divide and Conquer with Neural Networks},\nauthor={Alex Nowak and Joan Bruna},\nyear={2017},\nurl={https://openreview.net/forum?id=Hy3_KuYxg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Hy3_KuYxg", "pdf_size": 0, "rating": "3;4;4", "confidence": "2;4;2", "rating_avg": 3.6666666666666665, "confidence_avg": 2.6666666666666665, "replies_avg": 12, "authors#_avg": 2, "corr_rating_confidence": 0.5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2568837392349793356&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "Hy6b4Pqee", "title": "Deep Probabilistic Programming", "track": "main", "status": "Poster", "tldr": "", "abstract": "We propose Edward, a Turing-complete probabilistic programming language. 
Edward defines two compositional representations\u2014random variables and inference. By treating inference as a first class citizen, on a par with modeling, we show that probabilistic programming can be as flexible and computationally efficient as traditional deep learning. For flexibility, Edward makes it easy to fit the same model using a variety of composable inference methods, ranging from point estimation to variational inference to MCMC. In addition, Edward can reuse the modeling representation as part of inference, facilitating the design of rich variational models and generative adversarial networks. For efficiency, Edward is integrated into TensorFlow, providing significant speedups over existing probabilistic systems. For example, we show on a benchmark logistic regression task that Edward is at least 35x faster than Stan and 6x faster than PyMC3. Further, Edward incurs no runtime overhead: it is as fast as handwritten TensorFlow.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dustin Tran;Matthew D. Hoffman;Rif A. Saurous;Eugene Brevdo;Kevin Murphy;David M. Blei", "authorids": "dustin@cs.columbia.edu;mathoffm@adobe.com;rif@google.com;ebrevdo@google.com;kpmurphy@google.com;david.blei@columbia.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\ntran2017deep,\ntitle={Deep Probabilistic Programming},\nauthor={Dustin Tran and Matthew D. Hoffman and Rif A. Saurous and Eugene Brevdo and Kevin Murphy and David M. Blei},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=Hy6b4Pqee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=Hy6b4Pqee", "pdf_size": 0, "rating": "5;7;8", "confidence": "4;4;4", "rating_avg": 6.666666666666667, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 249, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3635651340212310579&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9 }, { "id": "Hy8X3aKee", "title": "Deep Symbolic Representation Learning for Heterogeneous Time-series Classification", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we consider the problem of event classification with multi-variate time series data consisting of heterogeneous (continuous and categorical) variables. The complex temporal dependencies between the variables combined with sparsity of the data makes the event classification problem particularly challenging. Most state-of-art approaches address this either by designing hand-engineered features or breaking up the problem over homogeneous variates. In this work, we propose and compare three representation learning algorithms over symbolized sequences which enables classification of heterogeneous time-series data using a deep architecture. The proposed representations are trained jointly along with the rest of the network architecture in an end-to-end fashion that makes the learned features discriminative for the given task. 
Experiments on three real-world datasets demonstrate the effectiveness of the proposed approaches.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shengdong Zhang;Soheil Bahrampour;Naveen Ramakrishnan;Mohak Shah", "authorids": "zhangshengdongofgz@gmail.com;Soheil.Bahrampour@us.bosch.com;Naveen.Ramakrishnan@us.bosch.com;mohak@mohakshah.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nzhang2017deep,\ntitle={Deep Symbolic Representation Learning for Heterogeneous Time-series Classification},\nauthor={Shengdong Zhang and Soheil Bahrampour and Naveen Ramakrishnan and Mohak Shah},\nyear={2017},\nurl={https://openreview.net/forum?id=Hy8X3aKee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Hy8X3aKee", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;3;4", "rating_avg": 4.0, "confidence_avg": 3.6666666666666665, "replies_avg": 13, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3880501317099363296&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "HyAbMKwxe", "title": "Tighter bounds lead to improved classifiers", "track": "main", "status": "Poster", "tldr": "", "abstract": "The standard approach to supervised classification involves the minimization of a log-loss as an upper bound to the classification error. While this is a tight bound early on in the optimization, it overemphasizes the influence of incorrectly classified examples far from the decision boundary. Updating the upper bound during the optimization leads to improved classification rates while transforming the learning into a sequence of minimization problems. 
In addition, in the context where the classifier is part of a larger system, this modification makes it possible to link the performance of the classifier to that of the whole system, allowing the seamless introduction of external constraints.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nicolas Le Roux", "authorids": "nicolas@le-roux.name", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@inproceedings{\nroux2017tighter,\ntitle={Tighter bounds lead to improved classifiers},\nauthor={Nicolas Le Roux},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=HyAbMKwxe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HyAbMKwxe", "pdf_size": 0, "rating": "4;6;8", "confidence": "4;4;5", "rating_avg": 6.0, "confidence_avg": 4.333333333333333, "replies_avg": 7, "authors#_avg": 1, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15690775714191325520&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "HyAddcLge", "title": "Revisiting Distributed Synchronous SGD", "track": "main", "status": "Reject", "tldr": "We proposed distributed synchronous stochastic optimization with backup workers, and show that it converge faster and to better test accuracies.", "abstract": "Distributed training of deep learning models on large-scale training data is typically conducted with asynchronous stochastic optimization to maximize the rate of updates, at the cost of additional noise introduced from asynchrony. In contrast, the synchronous approach is often thought to be impractical due to idle time wasted on waiting for straggling workers. We revisit these conventional beliefs in this paper, and examine the weaknesses of both approaches. We demonstrate that a third approach, synchronous optimization with backup workers, can avoid asynchronous noise while mitigating for the worst stragglers. 
Our approach is empirically validated and shown to converge faster and to better test accuracies.\n", "keywords": "Optimization;Deep learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Jianmin Chen*;Xinghao Pan*;Rajat Monga;Samy Bengio;Rafal Jozefowicz", "authorids": "jmchen@google.com;xinghao@google.com;rajatmonga@google.com;bengio@google.com;rafal@openai.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nchen*2017revisiting,\ntitle={Revisiting Distributed Synchronous {SGD}},\nauthor={Jianmin Chen* and Xinghao Pan* and Rajat Monga and Samy Bengio and Rafal Jozefowicz},\nyear={2017},\nurl={https://openreview.net/forum?id=HyAddcLge}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=HyAddcLge", "pdf_size": 0, "rating": "5;6;6", "confidence": "3;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 19, "authors#_avg": 5, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 980, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12724297556956064167&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 15 }, { "id": "HyCRyS9gx", "title": "Fast Adaptation in Generative Models with Generative Matching Networks", "track": "main", "status": "Reject", "tldr": "A nonparametric conditional generative model with fast small-shot adaptation", "abstract": "Despite recent advances, the remaining bottlenecks in deep generative models are necessity of extensive training and difficulties with generalization from small number of training examples.\nBoth problems may be addressed by conditional generative models that are trained to adapt the generative distribution to additional input data.\nSo far this idea was explored only under certain limitations such as restricting the input data to be a single object or multiple objects representing the same concept. \nIn this work we develop a new class of deep generative model called generative matching networks which is inspired by the recently proposed matching networks for one-shot learning in discriminative tasks and the ideas from meta-learning.\nBy conditioning on the additional input dataset, generative matching networks may instantly learn new concepts that were not available during the training but conform to a similar generative process, without explicit limitations on the number of additional input objects or the number of concepts they represent. \nOur experiments on the Omniglot dataset demonstrate that generative matching networks can significantly improve predictive performance on the fly as more additional data is available to the model and also adapt the latent space which is beneficial in the context of feature extraction.", "keywords": "Deep learning;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Sergey Bartunov;Dmitry P. Vetrov", "authorids": "sbos.net@gmail.com;vetrovd@yandex.ru", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nbartunov2017fast,\ntitle={Fast Adaptation in Generative Models with Generative Matching Networks},\nauthor={Sergey Bartunov and Dmitry P. 
Vetrov},\nyear={2017},\nurl={https://openreview.net/forum?id=HyCRyS9gx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HyCRyS9gx", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;3;4", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 12, "authors#_avg": 2, "corr_rating_confidence": 0.18898223650461363, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8540429090683982391&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "HyET6tYex", "title": "Universality in halting time", "track": "main", "status": "Reject", "tldr": "Normalized halting time distributions are independent of the input data distribution.", "abstract": "The authors present empirical distributions for the halting time (measured by the number of iterations to reach a given accuracy) of optimization algorithms applied to two random systems: spin glasses and deep learning. Given an algorithm, which we take to be both the optimization routine and the form of the random landscape, the fluctuations of the halting time follow a distribution that remains unchanged even when the input is changed drastically. We observe two main classes, a Gumbel-like distribution that appears in Google searches, human decision times, QR factorization and spin glasses, and a Gaussian-like distribution that appears in conjugate gradient method, deep network with MNIST input data and deep network with random input data. This empirical evidence suggests presence of a class of distributions for which the halting time is independent of the underlying distribution under some conditions.", "keywords": "Optimization", "primary_area": "", "supplementary_material": "", "author": "Levent Sagun;Thomas Trogdon;Yann LeCun", "authorids": "leventsagun@gmail.com;tom.trogdon@gmail.com;yann@cs.nyu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nsagun2017universality,\ntitle={Universality in halting time},\nauthor={Levent Sagun and Thomas Trogdon and Yann LeCun},\nyear={2017},\nurl={https://openreview.net/forum?id=HyET6tYex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer1", "site": "https://openreview.net/forum?id=HyET6tYex", "pdf_size": 0, "rating": "2;5;5", "confidence": "4;3;4", "rating_avg": 4.0, "confidence_avg": 3.6666666666666665, "replies_avg": 14, "authors#_avg": 3, "corr_rating_confidence": -0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:AQFkKO7ahAsJ:scholar.google.com/&scioq=Universality+in+halting+time&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "HyEeMu_xx", "title": "Progressive Attention Networks for Visual Attribute Prediction", "track": "main", "status": "Reject", "tldr": "Progressive attention model that accurately attends to the target objects of various scales and shapes through multiple CNN layers.", "abstract": "We propose a novel attention model which can accurately attend to target objects of various scales and shapes in images. The model is trained to gradually suppress irrelevant regions in an input image via a progressive attentive process over multiple layers of a convolutional neural network. The attentive process in each layer determines whether to pass or suppress features at certain spatial locations for use in the next layer. 
We further employ local contexts to estimate attention probability at each location since it is difficult to infer accurate attention by observing a feature vector from a single location only. The experiments on synthetic and real datasets show that the proposed attention network outperforms traditional attention methods in visual attribute prediction tasks.", "keywords": "Deep learning;Computer vision;Multi-modal learning", "primary_area": "", "supplementary_material": "", "author": "Paul Hongsuck Seo;Zhe Lin;Scott Cohen;Xiaohui Shen;Bohyung Han", "authorids": "hsseo@postech.ac.kr;zlin@adobe.com;scohen@adobe.com;xshen@adobe.com;bhhan@postech.ac.kr", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nseo2017progressive,\ntitle={Progressive Attention Networks for Visual Attribute Prediction},\nauthor={Paul Hongsuck Seo and Zhe Lin and Scott Cohen and Xiaohui Shen and Bohyung Han},\nyear={2017},\nurl={https://openreview.net/forum?id=HyEeMu_xx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HyEeMu_xx", "pdf_size": 0, "rating": "4;6;7", "confidence": "5;3;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 15, "authors#_avg": 5, "corr_rating_confidence": -0.6546536707079772, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13080679789625496701&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "HyFkG45gl", "title": "Machine Solver for Physics Word Problems", "track": "main", "status": "Reject", "tldr": "We build an automated solver for a class of physics word problems, using a combination of neural networks and a numerical integrator.", "abstract": "We build a machine solver for word problems on the physics of a free\nfalling object under constant acceleration of gravity. Each problem\nconsists of a formulation part, describing the setting, and a question\npart asking for the value of an unknown. Our solver consists of\ntwo long short-term memory recurrent neural networks and a numerical\nintegrator. The first neural network (the labeler) labels each\nword of the problem, identifying the physical parameters and the\nquestion part of the problem. The second neural network (the \nclassifier) identifies what is being asked in the question. Using\nthe information extracted by both networks, the numerical integrator\ncomputes the solution. We observe that the classifier is resilient\nto errors made by the labeler, which does a better job of identifying\nthe physics parameters than the question. Training, validation and test\nsets of problems are generated from a grammar, with validation and test\nproblems structurally different from the training problems. 
The overall\naccuracy of the solver on the test cases is 99.8%.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Megan Leszczynski;Jose Moreira", "authorids": "mel255@cornell.edu;jmoreira@us.ibm.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nleszczynski2017machine,\ntitle={Machine Solver for Physics Word Problems},\nauthor={Megan Leszczynski and Jose Moreira},\nyear={2017},\nurl={https://openreview.net/forum?id=HyFkG45gl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HyFkG45gl", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6772748326142614450&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "HyGTuv9eg", "title": "Incorporating long-range consistency in CNN-based texture generation", "track": "main", "status": "Poster", "tldr": "We propose a simple extension to the Gatys et al. algorithm which makes it possible to incorporate long-range structure into texture generation.", "abstract": "Gatys et al. (2015) showed that pair-wise products of features in a convolutional network are a very effective representation of image textures. We propose a simple modification to that representation which makes it possible to incorporate long-range structure into image generation, and to render images that satisfy various symmetry constraints. We show how this can greatly improve rendering of regular textures and of images that contain other kinds of symmetric structure. We also present applications to inpainting and season transfer.", "keywords": "Computer vision;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Guillaume Berger;Roland Memisevic", "authorids": "guillaume.berger@umontreal.ca;memisevr@iro.umontreal.ca", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nberger2017incorporating,\ntitle={Incorporating long-range consistency in {CNN}-based texture generation},\nauthor={Guillaume Berger and Roland Memisevic},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=HyGTuv9eg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HyGTuv9eg", "pdf_size": 0, "rating": "5;7;7", "confidence": "5;5;5", "rating_avg": 6.333333333333333, "confidence_avg": 5.0, "replies_avg": 9, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5240051382675709653&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 3 }, { "id": "HyM25Mqel", "title": "Sample Efficient Actor-Critic with Experience Replay", "track": "main", "status": "Poster", "tldr": "Prepared for ICLR 2017.", "abstract": "This paper presents an actor-critic deep reinforcement learning agent with experience replay that is stable, sample efficient, and performs remarkably well on challenging environments, including the discrete 57-game Atari domain and several continuous control problems. 
To achieve this, the paper introduces several innovations, including truncated importance sampling with bias correction, stochastic dueling network architectures, and a new trust region policy optimization method.", "keywords": "Deep learning;Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Ziyu Wang;Victor Bapst;Nicolas Heess;Volodymyr Mnih;Remi Munos;Koray Kavukcuoglu;Nando de Freitas", "authorids": "ziyu@google.com;vbapst@google.com;heess@google.com;vmnih@google.com;Munos@google.com;korayk@google.com;nandodefreitas@google.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nwang2017sample,\ntitle={Sample Efficient Actor-Critic with Experience Replay},\nauthor={Ziyu Wang and Victor Bapst and Nicolas Heess and Volodymyr Mnih and Remi Munos and Koray Kavukcuoglu and Nando de Freitas},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=HyM25Mqel}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HyM25Mqel", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;4;3", "rating_avg": 6.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 19, "authors#_avg": 7, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 1079, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8369222693188103740&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "HyNxRZ9xg", "title": "Cat2Vec: Learning Distributed Representation of Multi-field Categorical Data", "track": "main", "status": "Reject", "tldr": "an unsupervised pairwise interaction model to learning the distributed representation of multi-field categorical data", "abstract": "This paper presents a method of learning distributed representation for multi-field categorical data, which is a common data format with various applications such as recommender systems, social link prediction, and computational advertising. The success of non-linear models, e.g., factorisation machines, boosted trees, has proved the potential of exploring the interactions among inter-field categories. Inspired by Word2Vec, the distributed representation for natural language, we propose Cat2Vec (categories to vectors) model. In Cat2Vec, a low-dimensional continuous vector is automatically learned for each category in each field. The interactions among inter-field categories are further explored by different neural gates and the most informative ones are selected by pooling layers. In our experiments, with the exploration of the interactions between pairwise categories over layers, the model attains great improvement over state-of-the-art models in a supervised learning task, e.g., click prediction, while capturing the most significant interactions from the data. 
", "keywords": "Unsupervised Learning;Deep learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Ying Wen;Jun Wang;Tianyao Chen;Weinan Zhang", "authorids": "ying.wen@cs.ucl.ac.uk;jun.wang@cs.ucl.ac.uk;tychen@apex.sjtu.edu.cn;wnzhang@apex.sjtu.edu.cn", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nwen2017catvec,\ntitle={Cat2Vec: Learning Distributed Representation of Multi-field Categorical Data},\nauthor={Ying Wen and Jun Wang and Tianyao Chen and Weinan Zhang},\nyear={2017},\nurl={https://openreview.net/forum?id=HyNxRZ9xg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HyNxRZ9xg", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;5", "rating_avg": 4.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": 0.9999999999999998, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1681536775051809196&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "HyQJ-mclg", "title": "Incremental Network Quantization: Towards Lossless CNNs with Low-precision Weights", "track": "main", "status": "Poster", "tldr": "This paper presents INQ, targeting to efficiently transform any pre-trained full-precision convolutional neural network (CNN) model into a low-precision version whose connection weights are constrained to be either powers of two or zero.", "abstract": "This paper presents incremental network quantization (INQ), a novel method, targeting to efficiently convert any pre-trained full-precision convolutional neural network (CNN) model into a low-precision version whose weights are constrained to be either powers of two or zero. Unlike existing methods which are struggled in noticeable accuracy loss, our INQ has the potential to resolve this issue, as benefiting from two innovations. On one hand, we introduce three interdependent operations, namely weight partition, group-wise quantization and re-training. A well-proven measure is employed to divide the weights in each layer of a pre-trained CNN model into two disjoint groups. The weights in the first group are responsible to form a low-precision base, thus they are quantized by a variable-length encoding method. The weights in the other group are responsible to compensate for the accuracy loss from the quantization, thus they are the ones to be re-trained. On the other hand, these three operations are repeated on the latest re-trained group in an iterative manner until all the weights are converted into low-precision ones, acting as an incremental network quantization and accuracy enhancement procedure. Extensive experiments on the ImageNet classification task using almost all known deep CNN architectures including AlexNet, VGG-16, GoogleNet and ResNets well testify the efficacy of the proposed method. Specifically, at 5-bit quantization (a variable-length encoding: 1 bit for representing zero value, and the remaining 4 bits represent at most 16 different values for the powers of two), our models have improved accuracy than the 32-bit floating-point references. Taking ResNet-18 as an example, we further show that our quantized models with 4-bit, 3-bit and 2-bit ternary weights have improved or very similar accuracy against its 32-bit floating-point baseline. 
Besides, impressive results with the combination of network pruning and INQ are also reported. We believe that our method sheds new insights on how to make deep CNNs to be applicable on mobile or embedded devices. The code will be made publicly available.", "keywords": "Deep learning;Optimization", "primary_area": "", "supplementary_material": "", "author": "Aojun Zhou;Anbang Yao;Yiwen Guo;Lin Xu;Yurong Chen", "authorids": "aojun.zhou@intel.com;anbang.yao@intel.com;yiwen.guo@intel.com;lin.x.xu@intel.com;yurong.chen@intel.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nzhou2017incremental,\ntitle={Incremental Network Quantization: Towards Lossless {CNN}s with Low-precision Weights},\nauthor={Aojun Zhou and Anbang Yao and Yiwen Guo and Lin Xu and Yurong Chen},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=HyQJ-mclg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HyQJ-mclg", "pdf_size": 0, "rating": "7;7;8", "confidence": "3;4;4", "rating_avg": 7.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 10, "authors#_avg": 5, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 1390, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10552103322105352604&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "HyQWFOVge", "title": "Significance of Softmax-Based Features over Metric Learning-Based Features", "track": "main", "status": "Reject", "tldr": "We show softmax-based features are markedly better than state-of-the-art metric learning-based features by conducting fair comparison between them.", "abstract": "The extraction of useful deep features is important for many computer vision tasks.\nDeep features extracted from classification networks have proved to perform well in those tasks.\nTo obtain features of greater usefulness, end-to-end distance metric learning (DML) has been applied to train the feature extractor directly.\nEnd-to-end DML approaches such as Magnet Loss and lifted structured feature embedding show state-of-the-art performance in several image recognition tasks.\nHowever, in these DML studies, there were no equitable comparisons between features extracted from a DML-based network and those from a softmax-based network.\nIn this paper, by presenting objective comparisons between these two approaches under the same network architecture, we show that the softmax-based features are markedly better than the state-of-the-art DML features for tasks such as fine-grained recognition, attribute estimation, clustering, and retrieval.", "keywords": "Computer vision;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Shota Horiguchi;Daiki Ikami;Kiyoharu Aizawa", "authorids": "horiguchi@hal.t.u-tokyo.ac.jp;ikami@hal.t.u-tokyo.ac.jp;aizawa@hal.t.u-tokyo.ac.jp", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nhoriguchi2017significance,\ntitle={Significance of Softmax-Based Features over Metric Learning-Based Features},\nauthor={Shota Horiguchi and Daiki Ikami and Kiyoharu 
Aizawa},\nyear={2017},\nurl={https://openreview.net/forum?id=HyQWFOVge}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HyQWFOVge", "pdf_size": 0, "rating": "4;5;7", "confidence": "5;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 24, "authors#_avg": 3, "corr_rating_confidence": -0.7559289460184544, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4423507220937998934&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "HyTqHL5xg", "title": "Deep Variational Bayes Filters: Unsupervised Learning of State Space Models from Raw Data", "track": "main", "status": "Poster", "tldr": "", "abstract": "We introduce Deep Variational Bayes Filters (DVBF), a new method for unsupervised learning and identification of latent Markovian state space models. Leveraging recent advances in Stochastic Gradient Variational Bayes, DVBF can overcome intractable inference distributions via variational inference. Thus, it can handle highly nonlinear input data with temporal and spatial dependencies such as image sequences without domain knowledge. Our experiments show that enabling backpropagation through transitions enforces state space assumptions and significantly improves information content of the latent embedding. This also enables realistic long-term prediction.\n", "keywords": "Deep learning;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Maximilian Karl;Maximilian Soelch;Justin Bayer;Patrick van der Smagt", "authorids": "karlma@in.tum.de;m.soelch@tum.de;bayer.justin@googlemail.com;smagt@brml.org", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nkarl2017deep,\ntitle={Deep Variational Bayes Filters: Unsupervised Learning of State Space Models from Raw Data},\nauthor={Maximilian Karl and Maximilian Soelch and Justin Bayer and Patrick van der Smagt},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=HyTqHL5xg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HyTqHL5xg", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;4;3", "rating_avg": 6.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 12, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 484, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6648795604218524959&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "HyWDCXjgx", "title": "Multi-label learning with the RNNs for Fashion Search", "track": "main", "status": "Reject", "tldr": "Works for applying LSTM into the multi-label learning in an application to computer vision", "abstract": "We build a large-scale visual search system which finds similar product images given a fashion item. Defining similarity among arbitrary fashion-products is still remains a challenging problem, even there is no exact ground-truth. To resolve this problem, we define more than 90 fashion-related attributes, and combination of these attributes can represent thousands of unique fashion-styles. We then introduce to use the recurrent neural networks (RNNs) recognising multi fashion-attributes with the end-to-end manner. 
To build our system at scale, these fashion-attributes are again used to build an inverted indexing scheme. In addition to these fashion-attributes for semantic similarity, we extract colour and appearance features in a region-of-interest (ROI) of a fashion item for visual similarity. By sharing our approach, we expect active discussion on that how to apply current deep learning researches into the e-commerce industry.", "keywords": "Computer vision;Deep learning;Supervised Learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Taewan Kim", "authorids": "taey.16@navercorp.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nkim2017multilabel,\ntitle={Multi-label learning with the {RNN}s for Fashion Search},\nauthor={Taewan Kim},\nyear={2017},\nurl={https://openreview.net/forum?id=HyWDCXjgx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HyWDCXjgx", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;3;4", "rating_avg": 3.3333333333333335, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 1, "corr_rating_confidence": 0.49999999999999983, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7745137634722424236&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "HyWG0H5ge", "title": "Neural Taylor Approximations: Convergence and Exploration in Rectifier Networks", "track": "main", "status": "Workshop", "tldr": "We provide the first convergence result for rectifier neural networks and investigate implications for exploration in shattered landscapes.", "abstract": "Modern convolutional networks, incorporating rectifiers and max-pooling, are neither smooth nor convex. Standard guarantees therefore do not apply. Nevertheless, methods from convex optimization such as gradient descent and Adam are widely used as building blocks for deep learning algorithms. This paper provides the first convergence guarantee applicable to modern convnets. The guarantee matches a lower bound for convex nonsmooth functions. The key technical tool is the neural Taylor approximation -- a straightforward application of Taylor expansions to neural networks -- and the associated Taylor loss. Experiments on a range of optimizers, layers, and tasks provide evidence that the analysis accurately captures the dynamics of neural optimization.\n\nThe second half of the paper applies the Taylor approximation to isolate the main difficulty in training rectifier nets: that gradients are shattered. 
We investigate the hypothesis that, by exploring the space of activation configurations more thoroughly, adaptive optimizers such as RMSProp and Adam are able to converge to better solutions.", "keywords": "Deep learning;Optimization;Theory;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "David Balduzzi;Brian McWilliams;Tony Butler-Yeoman", "authorids": "david.balduzzi@vuw.ac.nz;brian@disneyresearch.com;butlertony@ecs.vuw.ac.nz", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nbalduzzi2017neural,\ntitle={Neural Taylor Approximations: Convergence and Exploration in Rectifier Networks},\nauthor={David Balduzzi and Brian McWilliams and Tony Butler-Yeoman},\nyear={2017},\nurl={https://openreview.net/forum?id=HyWG0H5ge}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HyWG0H5ge", "pdf_size": 0, "rating": "3;7;7", "confidence": "4;3;2", "rating_avg": 5.666666666666667, "confidence_avg": 3.0, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844387, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15651118293090411832&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "HyWWpw5ex", "title": "Recurrent Coevolutionary Feature Embedding Processes for Recommendation", "track": "main", "status": "Reject", "tldr": "Our work combines recurrent neural network with point process models for recommendation, which captures the co-evolution nature of users' and items' latent features.", "abstract": "Recommender systems often use latent features to explain the behaviors of users and capture the properties of items. As users interact with different items over time, user and item features can influence each other, evolve and co-evolve over time. To accurately capture the fine grained nonlinear coevolution of these features, we propose a recurrent coevolutionary feature embedding process model, which combines recurrent neural network (RNN) with a multi-dimensional point process model. The RNN learns a nonlinear representation of user and item embeddings which take into account mutual influence between user and item features, and the feature evolution over time. We also develop an efficient stochastic gradient algorithm for learning parameters. Experiments on diverse real-world datasets demonstrate significant improvements in user behavior prediction compared to state-of-the-arts. 
", "keywords": "Deep learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Hanjun Dai*;Yichen Wang*;Rakshit Trivedi;Le Song", "authorids": "hanjundai@gatech.edu;yichen.wang@gatech.edu;rstrivedi@gatech.edu;lsong@cc.gatech.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ndai*2017recurrent,\ntitle={Recurrent Coevolutionary Feature Embedding Processes for Recommendation},\nauthor={Hanjun Dai* and Yichen Wang* and Rakshit Trivedi and Le Song},\nyear={2017},\nurl={https://openreview.net/forum?id=HyWWpw5ex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HyWWpw5ex", "pdf_size": 0, "rating": "6;6;6", "confidence": "3;4;4", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 14, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "HyY4Owjll", "title": "Boosted Generative Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose a new approach for using boosting to create an ensemble of generative models, where models are trained in sequence to correct earlier mistakes. Our algorithm can leverage many existing base learners, including recent latent variable models. Further, our approach allows the ensemble to leverage discriminative models trained to distinguish real data from model generated data. We show theoretical conditions under which incorporating a new model to the ensemble will improve the fit and empirically demonstrate the effectiveness of boosting on density estimation and sample generation on real and synthetic datasets.", "keywords": "Theory;Deep learning;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Aditya Grover;Stefano Ermon", "authorids": "adityag@cs.stanford.edu;ermon@cs.stanford.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ngrover2017boosted,\ntitle={Boosted Generative Models},\nauthor={Aditya Grover and Stefano Ermon},\nyear={2017},\nurl={https://openreview.net/forum?id=HyY4Owjll}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HyY4Owjll", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;3;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.0, "replies_avg": 11, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 64, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7173368572009538914&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "Hyanrrqlg", "title": "HFH: Homologically Functional Hashing for Compressing Deep Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "As the complexity of deep neural networks (DNNs) trends to grow to absorb the increasing sizes of data, memory and energy consumption has been receiving more and more attentions for industrial applications, especially on mobile devices. This paper presents a novel structure based on homologically functional hashing to compress DNNs, shortly named as HFH. 
For each weight entry in a deep net, HFH uses multiple low-cost hash functions to fetch values in a compression space, and then employs a small reconstruction network to recover that entry. The compression space is homological because all layers fetch hashed values from it. The reconstruction network is plugged into the whole network and trained jointly. On several benchmark datasets, HFH demonstrates high compression ratios with little loss on prediction accuracy. Particularly, HFH includes the recently proposed HashedNets as a degenerated scenario and shows significantly improved performance. Moreover, the homological hashing essence facilitates us to efficiently figure out one single desired compression ratio, instead of exhaustive searching throughout a combinatory space configured by all layers.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lei Shi;Shikun Feng;Zhifan Zhu", "authorids": "shilei06@baidu.com;fengshikun01@baidu.com;zhuzhifan@baidu.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nshi2017hfh,\ntitle={{HFH}: Homologically Functional Hashing for Compressing Deep Neural Networks},\nauthor={Lei Shi and Shikun Feng and Zhifan Zhu},\nyear={2017},\nurl={https://openreview.net/forum?id=Hyanrrqlg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Hyanrrqlg", "pdf_size": 0, "rating": "4;5;6", "confidence": "0;5;4", "rating_avg": 5.0, "confidence_avg": 3.0, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": 0.7559289460184544, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:8kN2elYGuWYJ:scholar.google.com/&scioq=HFH:+Homologically+Functional+Hashing+for+Compressing+Deep+Neural+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "HycUbvcge", "title": "Deep Generalized Canonical Correlation Analysis", "track": "main", "status": "Reject", "tldr": "A multiview representation learning technique that can learn nonlinear mappings from arbitrarily many views to a shared semantic space -- Deep Generalized Canonical Correlation Analysis.", "abstract": "We present Deep Generalized Canonical Correlation Analysis (DGCCA) \u2013 a method for learning nonlinear transformations of arbitrarily many views of data, such that the resulting transformations are maximally informative of each other. While methods for nonlinear two-view representation learning (Deep CCA, (Andrew et al., 2013)) and linear many-view representation learning (Generalized CCA (Horst, 1961)) exist, DGCCA is the first CCA-style multiview representation learning technique that combines the flexibility of nonlinear (deep) representation learning with the statistical power of incorporating information from many independent sources, or views. We present the DGCCA formulation as well as an efficient stochastic optimization algorithm for solving it. We learn DGCCA representations on two distinct datasets for three downstream tasks: phonetic transcription from acoustic and articulatory measurements, and recommending hashtags and friends on a dataset of Twitter users. 
We find that DGCCA representations soundly beat existing methods at phonetic transcription and hashtag recommendation, and in general perform no worse than standard linear many-view techniques.", "keywords": "Unsupervised Learning;Deep learning;Multi-modal learning", "primary_area": "", "supplementary_material": "", "author": "Adrian Benton;Huda Khayrallah;Biman Gujral;Drew Reisinger;Sheng Zhang;Raman Arora", "authorids": "adrian@cs.jhu.edu;huda@jhu.edu;bgujral1@jhu.edu;reisinger@cogsci.jhu.edu;zsheng2@jhu.edu;arora@cs.jhu.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nbenton2017deep,\ntitle={Deep Generalized Canonical Correlation Analysis},\nauthor={Adrian Benton and Huda Khayrallah and Biman Gujral and Drew Reisinger and Sheng Zhang and Raman Arora},\nyear={2017},\nurl={https://openreview.net/forum?id=HycUbvcge}\n}", "github": "", "project": "", "reviewers": "AnonReviewer5;AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=HycUbvcge", "pdf_size": 0, "rating": "5;6;7", "confidence": "5;4;3", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 6, "corr_rating_confidence": -1.0, "gs_citation": 199, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14255180452281897227&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "HyecJGP5ge", "title": "NEUROGENESIS-INSPIRED DICTIONARY LEARNING: ONLINE MODEL ADAPTION IN A CHANGING WORLD", "track": "main", "status": "Reject", "tldr": "An online dictionary learning incorporates dynamic model adaptation, adding/deleting its elements in response to nonstationary data.", "abstract": "In this paper, we focus on online representation learning in non-stationary environments which may require continuous adaptation of model\u2019s architecture. We propose a novel online dictionary-learning (sparse-coding) framework which incorporates the addition and deletion of hidden units (dictionary elements), and is inspired by the adult neurogenesis phenomenon in the dentate gyrus of the hippocampus, known to be associated with improved cognitive function and adaptation to new environments. In the online learning setting, where new input instances arrive sequentially in batches, the \u201cneuronal birth\u201d is implemented by adding new units with random initial weights (random dictionary elements); the number of new units is determined by the current performance (representation error) of the dictionary, higher error causing an increase in the birth rate. \u201cNeuronal death\u201d is implemented by imposing l1/l2-regularization (group sparsity) on the dictionary within the block-coordinate descent optimization at each iteration of our online alternating minimization scheme, which iterates between the code and dictionary updates. Finally, hidden unit connectivity adaptation is facilitated by introducing sparsity in dictionary elements. Our empirical evaluation on several real-life datasets (images and language) as well as on synthetic data demonstrates that the proposed approach can considerably outperform the state-of-art fixed-size (nonadaptive) online sparse coding of Mairal et al. (2009) in the presence of nonstationary data. 
Moreover, we identify certain properties of the data (e.g., sparse inputs with nearly non-overlapping supports) and of the model (e.g., dictionary sparsity) associated with such improvements.", "keywords": "Unsupervised Learning;Computer vision;Transfer Learning;Optimization;Applications", "primary_area": "", "supplementary_material": "", "author": "Sahil Garg;Irina Rish;Guillermo Cecchi;Aurelie Lozano", "authorids": "sahilgar@usc.edu;rish@us.ibm.com;gcecchi@us.ibm.com;aclozano@us.ibm.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ngarg2017neurogenesisinspired,\ntitle={{NEUROGENESIS}-{INSPIRED} {DICTIONARY} {LEARNING}: {ONLINE} {MODEL} {ADAPTION} {IN} A {CHANGING} {WORLD}},\nauthor={Sahil Garg and Irina Rish and Guillermo Cecchi and Aurelie Lozano},\nyear={2017},\nurl={https://openreview.net/forum?id=HyecJGP5ge}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HyecJGP5ge", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;3;4", "rating_avg": 5.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 11, "authors#_avg": 4, "corr_rating_confidence": 0.49999999999999983, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11906953023449684955&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11 }, { "id": "HyenWc5gx", "title": "Representation Stability as a Regularizer for Improved Text Analytics Transfer Learning", "track": "main", "status": "Reject", "tldr": "We propose a novel general purpose regularizer to address catastrophic forgetting in neural network sequential transfer learning.", "abstract": "Although neural networks are well suited for sequential transfer learning tasks, the catastrophic forgetting problem hinders proper integration of prior knowledge. In this work, we propose a solution to this problem by using a multi-task objective based on the idea of distillation and a mechanism that directly penalizes forgetting at the shared representation layer during the knowledge integration phase of training. We demonstrate our approach on a Twitter domain sentiment analysis task with sequential knowledge transfer from four related tasks. We show that our technique outperforms networks fine-tuned to the target task. Additionally, we show both through empirical evidence and examples that it does not forget useful knowledge from the source task that is forgotten during standard fine-tuning. Surprisingly, we find that first distilling a human made rule based sentiment engine into a recurrent neural network and then integrating the knowledge with the target task data leads to a substantial gain in generalization performance. Our experiments demonstrate the power of multi-source transfer techniques in practical text analytics problems when paired with distillation. 
In particular, for the SemEval 2016 Task 4 Subtask A (Nakov et al., 2016) dataset we surpass the state of the art established during the competition with a comparatively simple model architecture that is not even competitive when trained on only the labeled task specific data.", "keywords": "Deep learning;Transfer Learning;Natural language processing", "primary_area": "", "supplementary_material": "", "author": "Matthew Riemer;Elham Khabiri;Richard Goodwin", "authorids": "mdriemer@us.ibm.com;ekhabiri@us.ibm.com;rgoodwin@us.ibm.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nriemer2017representation,\ntitle={Representation Stability as a Regularizer for Improved Text Analytics Transfer Learning},\nauthor={Matthew Riemer and Elham Khabiri and Richard Goodwin},\nyear={2017},\nurl={https://openreview.net/forum?id=HyenWc5gx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HyenWc5gx", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;3;4", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 12, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6962820622885997368&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "HyoST_9xl", "title": "DSD: Dense-Sparse-Dense Training for Deep Neural Networks", "track": "main", "status": "Poster", "tldr": "DSD effectively achieves superior optimization performance on a wide range of deep neural networks.", "abstract": "Modern deep neural networks have a large number of parameters, making them very hard to train. We propose DSD, a dense-sparse-dense training flow, for regularizing deep neural networks and achieving better optimization performance. In the first D (Dense) step, we train a dense network to learn connection weights and importance. In the S (Sparse) step, we regularize the network by pruning the unimportant connections with small weights and retraining the network given the sparsity constraint. In the final D (re-Dense) step, we increase the model capacity by removing the sparsity constraint, re-initialize the pruned parameters from zero and retrain the whole dense network. Experiments show that DSD training can improve the performance for a wide range of CNNs, RNNs and LSTMs on the tasks of image classification, caption generation and speech recognition. On ImageNet, DSD improved the Top1 accuracy of GoogLeNet by 1.1%, VGG-16 by 4.3%, ResNet-18 by 1.2% and ResNet-50 by 1.1%, respectively. On the WSJ\u201993 dataset, DSD improved DeepSpeech and DeepSpeech2 WER by 2.0% and 1.1%. On the Flickr-8K dataset, DSD improved the NeuralTalk BLEU score by over 1.7. DSD is easy to use in practice: at training time, DSD incurs only one extra hyper-parameter: the sparsity ratio in the S step. At testing time, DSD doesn\u2019t change the network architecture or incur any inference overhead. The consistent and significant performance gain of DSD experiments shows the inadequacy of the current training methods for finding the best local optimum, while DSD effectively achieves superior optimization performance for finding a better solution. 
DSD models are available to download at https://songhan.github.io/DSD.", "keywords": "Deep learning", "primary_area": "", "supplementary_material": "", "author": "Song Han;Jeff Pool;Sharan Narang;Huizi Mao;Enhao Gong;Shijian Tang;Erich Elsen;Peter Vajda;Manohar Paluri;John Tran;Bryan Catanzaro;William J. Dally", "authorids": "songhan@stanford.edu;jpool@nvidia.com;sharan@baidu.com;huizi@stanford.edu;enhaog@stanford.edu;sjtang@stanford.edu;eriche@google.com;vajdap@fb.com;mano@fb.com;johntran@nvidia.com;bcatanzaro@nvidia.com;dally@stanford.edu", "gender": ";;;;;;;;;;;", "homepage": ";;;;;;;;;;;", "dblp": ";;;;;;;;;;;", "google_scholar": ";;;;;;;;;;;", "orcid": ";;;;;;;;;;;", "linkedin": ";;;;;;;;;;;", "or_profile": ";;;;;;;;;;;", "aff": ";;;;;;;;;;;", "aff_domain": ";;;;;;;;;;;", "position": ";;;;;;;;;;;", "bibtex": "@inproceedings{\nhan2017dsd,\ntitle={{DSD}: Dense-Sparse-Dense Training for Deep Neural Networks},\nauthor={Song Han and Jeff Pool and Sharan Narang and Huizi Mao and Enhao Gong and Shijian Tang and Erich Elsen and Peter Vajda and Manohar Paluri and John Tran and Bryan Catanzaro and William J. Dally},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=HyoST_9xl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HyoST_9xl", "pdf_size": 0, "rating": "5;8;8", "confidence": "4;3;3", "rating_avg": 7.0, "confidence_avg": 3.3333333333333335, "replies_avg": 13, "authors#_avg": 12, "corr_rating_confidence": -1.0, "gs_citation": 265, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11418339766692426485&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "Hyq4yhile", "title": "Learning Invariant Feature Spaces to Transfer Skills with Reinforcement Learning", "track": "main", "status": "Poster", "tldr": "Learning a common feature space between robots with different morphology or actuation to transfer skills.", "abstract": "People can learn a wide range of tasks from their own experience, but can also learn from observing other creatures. This can accelerate acquisition of new skills even when the observed agent differs substantially from the learning agent in terms of morphology. In this paper, we examine how reinforcement learning algorithms can transfer knowledge between morphologically different agents (e.g., different robots). We introduce a problem formulation where two agents are tasked with learning multiple skills by sharing information. Our method uses the skills that were learned by both agents to train invariant feature spaces that can then be used to transfer other skills from one agent to another. The process of learning these invariant feature spaces can be viewed as a kind of ``analogy making,'' or implicit learning of partial correspondences between two distinct domains. 
We evaluate our transfer learning algorithm in two simulated robotic manipulation skills, and illustrate that we can transfer knowledge between simulated robotic arms with different numbers of links, as well as simulated arms with different actuation mechanisms, where one robot is torque-driven while the other is tendon-driven.", "keywords": "Deep learning;Reinforcement Learning;Transfer Learning", "primary_area": "", "supplementary_material": "", "author": "Abhishek Gupta;Coline Devin;YuXuan Liu;Pieter Abbeel;Sergey Levine", "authorids": "abhigupta@berkeley.edu;coline@berkeley.edu;yuxuanliu@berkeley.edu;pabbeel@cs.berkeley.edu;svlevine@eecs.berkeley.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\ngupta2017learning,\ntitle={Learning Invariant Feature Spaces to Transfer Skills with Reinforcement Learning},\nauthor={Abhishek Gupta and Coline Devin and YuXuan Liu and Pieter Abbeel and Sergey Levine},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=Hyq4yhile}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=Hyq4yhile", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;5;3", "rating_avg": 6.333333333333333, "confidence_avg": 4.0, "replies_avg": 16, "authors#_avg": 5, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 369, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7799444895009764859&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "HysBZSqlx", "title": "Playing SNES in the Retro Learning Environment", "track": "main", "status": "Reject", "tldr": "Investigating Deep Reinforcement Learning algorithms in a new framework based on the SNES gaming console", "abstract": "Mastering a video game requires skill, tactics and strategy. While these attributes may be acquired naturally by human players, teaching them to a computer program is a far more challenging task. In recent years, extensive research was carried out in the field of reinforcement learning and numerous algorithms were introduced, aiming to learn how to perform human tasks such as playing video games. As a result, the Arcade Learning Environment (ALE) (Bellemare et al., 2013) has become a commonly used benchmark environment allowing algorithms to train on various Atari 2600 games. In many games the state-of-the-art algorithms outperform humans. In this paper we introduce a new learning environment, the Retro Learning Environment \u2014 RLE, that can run games on the Super Nintendo Entertainment System (SNES), Sega Genesis and several other gaming consoles. The environment is expandable, allowing for more video games and consoles to be easily added to the environment, while maintaining the same interface as ALE. Moreover, RLE is compatible with Python and Torch. 
SNES games pose a significant challenge to current algorithms due to their higher level of complexity and versatility.", "keywords": "Reinforcement Learning;Deep learning;Games", "primary_area": "", "supplementary_material": "", "author": "Nadav Bhonker;Shai Rozenberg;Itay Hubara", "authorids": "nadavbh@tx.technion.ac.il;shairoz@tx.technion.ac.il;itayhubara@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nbhonker2017playing,\ntitle={Playing {SNES} in the Retro Learning Environment},\nauthor={Nadav Bhonker and Shai Rozenberg and Itay Hubara},\nyear={2017},\nurl={https://openreview.net/forum?id=HysBZSqlx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HysBZSqlx", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 13, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3936309065596022767&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "Hyvw0L9el", "title": "Generating Interpretable Images with Controllable Structure", "track": "main", "status": "Workshop", "tldr": "Autoregressive text-to-image synthesis with controllable spatial structure.", "abstract": "We demonstrate improved text-to-image synthesis with controllable object locations using an extension of Pixel Convolutional Neural Networks (PixelCNN). In addition to conditioning on text, we show how the model can generate images conditioned on part keypoints and segmentation masks. The character-level text encoder and image generation network are jointly trained end-to-end via maximum likelihood. 
We establish quantitative baselines in terms of text and structure-conditional pixel log-likelihood for three data sets: Caltech-UCSD Birds (CUB), MPII Human Pose (MHP), and Common Objects in Context (MS-COCO).", "keywords": "Deep learning;Computer vision;Multi-modal learning;Natural language processing", "primary_area": "", "supplementary_material": "", "author": "Scott Reed;A\u00e4ron van den Oord;Nal Kalchbrenner;Victor Bapst;Matt Botvinick;Nando de Freitas", "authorids": "reedscot@google.com;avdnoord@google.com;nalk@google.com;vbapst@google.com;botvinick@google.com;nandodefreitas@google.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nreed2017generating,\ntitle={Generating Interpretable Images with Controllable Structure},\nauthor={Scott Reed and A\u00e4ron van den Oord and Nal Kalchbrenner and Victor Bapst and Matt Botvinick and Nando de Freitas},\nyear={2017},\nurl={https://openreview.net/forum?id=Hyvw0L9el}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=Hyvw0L9el", "pdf_size": 0, "rating": "5;6;7", "confidence": "3;3;3", "rating_avg": 6.0, "confidence_avg": 3.0, "replies_avg": 11, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 73, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12986194106056902375&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "HyxQzBceg", "title": "Deep Variational Information Bottleneck", "track": "main", "status": "Poster", "tldr": "Applying the information bottleneck to deep networks using the variational lower bound and reparameterization trick.", "abstract": "We present a variational approximation to the information bottleneck of Tishby et al. (1999). This variational approach allows us to parameterize the information bottleneck model using a neural network and leverage the reparameterization trick for efficient training. We call this method \u201cDeep Variational Information Bottleneck\u201d, or Deep VIB. We show that models trained with the VIB objective outperform those that are trained with other forms of regularization, in terms of generalization performance and robustness to adversarial attack.", "keywords": "Theory;Computer vision;Deep learning;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Alexander A. Alemi;Ian Fischer;Joshua V. Dillon;Kevin Murphy", "authorids": "alemi@google.com;iansf@google.com;jvdillon@google.com;kpmurphy@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nalemi2017deep,\ntitle={Deep Variational Information Bottleneck},\nauthor={Alexander A. Alemi and Ian Fischer and Joshua V. 
Dillon and Kevin Murphy},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=HyxQzBceg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=HyxQzBceg", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;3;4", "rating_avg": 6.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 17, "authors#_avg": 4, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 2223, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7425625104303674821&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "S11KBYclx", "title": "Learning Curve Prediction with Bayesian Neural Networks", "track": "main", "status": "Poster", "tldr": "We present a general probabilistic method based on Bayesian neural networks to predict learning curves of iterative machine learning methods.", "abstract": "Different neural network architectures, hyperparameters and training protocols lead to different performances as a function of time.\nHuman experts routinely inspect the resulting learning curves to quickly terminate runs with poor hyperparameter settings and thereby considerably speed up manual hyperparameter optimization. Exploiting the same information in automatic Bayesian hyperparameter optimization requires a probabilistic model of learning curves across hyperparameter settings. Here, we study the use of Bayesian neural networks for this purpose and improve their performance by a specialized learning curve layer.", "keywords": "Deep learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Aaron Klein;Stefan Falkner;Jost Tobias Springenberg;Frank Hutter", "authorids": "kleinaa@cs.uni-freiburg.de;sfalkner@cs.uni-freiburg.de;springj@cs.uni-freiburg.de;fh@cs.uni-freiburg.de", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nklein2017learning,\ntitle={Learning Curve Prediction with Bayesian Neural Networks},\nauthor={Aaron Klein and Stefan Falkner and Jost Tobias Springenberg and Frank Hutter},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=S11KBYclx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=S11KBYclx", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;5;4", "rating_avg": 7.0, "confidence_avg": 4.333333333333333, "replies_avg": 13, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 301, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11158771144527282334&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "id": "S13wCE9xx", "title": "Riemannian Optimization for Skip-Gram Negative Sampling", "track": "main", "status": "Reject", "tldr": "We train word embeddings optimizing Skip-Gram Negative Sampling objective (known by word2vec) via Riemannian low-rank optimization framework", "abstract": "Skip-Gram Negative Sampling (SGNS) word embedding model, well known by its implementation in \"word2vec\" software, is usually optimized by stochastic gradient descent. It can be shown that optimizing for SGNS objective can be viewed as an optimization problem of searching for a good matrix with the low-rank constraint. 
The most standard way to solve this type of problems is to apply Riemannian optimization framework to optimize the SGNS objective over the manifold of required low-rank matrices. In this paper, we propose an algorithm that optimizes SGNS objective using Riemannian optimization and demonstrates its superiority over popular competitors, such as the original method to train SGNS and SVD over SPPMI matrix.", "keywords": "Natural language processing;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Alexander Fonarev;Alexey Grinchuk;Gleb Gusev;Pavel Serdyukov;Ivan Oseledets", "authorids": "newo@newo.su;oleksii.hrinchuk@skolkovotech.ru;gleb57@yandex-team.ru;pavser@yandex-team.ru;ioseledets@skoltech.ru", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nfonarev2017riemannian,\ntitle={Riemannian Optimization for Skip-Gram Negative Sampling},\nauthor={Alexander Fonarev and Alexey Grinchuk and Gleb Gusev and Pavel Serdyukov and Ivan Oseledets},\nyear={2017},\nurl={https://openreview.net/forum?id=S13wCE9xx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=S13wCE9xx", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;3;3", "rating_avg": 5.0, "confidence_avg": 3.3333333333333335, "replies_avg": 10, "authors#_avg": 5, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5089310440440529890&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "S19eAF9ee", "title": "Structured Sequence Modeling with Graph Convolutional Recurrent Networks", "track": "main", "status": "Reject", "tldr": "This paper introduces a neural network to model graph-structured sequences", "abstract": "This paper introduces Graph Convolutional Recurrent Network (GCRN), a deep learning model able to predict structured sequences of data. Precisely, GCRN is a generalization of classical recurrent neural networks (RNN) to data structured by any arbitrary graph. Such structured sequences can be series of frames in videos, spatio-temporal measurements on a network of sensors, or random walks on a vocabulary graph for natural language modeling.The proposed model combines convolutional neural networks (CNN) on graphs to identify spatial structures and RNN to find dynamic patterns. We study two possible architectures of GCRN, and apply the models to two practical problems: predicting moving MNIST data, and modeling natural language with the Penn Treebank dataset. 
Experiments show that exploiting simultaneously graph spatial and dynamic information about data can improve both precision and learning speed.", "keywords": "Structured prediction", "primary_area": "", "supplementary_material": "", "author": "Youngjoo Seo;Micha\u00ebl Defferrard;Pierre Vandergheynst;Xavier Bresson", "authorids": "youngjoo.seo@epfl.ch;michael.defferrard@epfl.ch;pierre.vandergheynst@epfl.ch;xavier.bresson@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nseo2017structured,\ntitle={Structured Sequence Modeling with Graph Convolutional Recurrent Networks},\nauthor={Youngjoo Seo and Micha{\\\"e}l Defferrard and Pierre Vandergheynst and Xavier Bresson},\nyear={2017},\nurl={https://openreview.net/forum?id=S19eAF9ee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=S19eAF9ee", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;4;4", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 13, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 1116, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17586895734361677328&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9 }, { "id": "S1AG8zYeg", "title": "Sentence Ordering using Recurrent Neural Networks", "track": "main", "status": "Reject", "tldr": "We consider the problem of organizing a given collection of sentences into a coherent order.", "abstract": "Modeling the structure of coherent texts is a task of great importance in NLP. The task of organizing a given set of sentences into a coherent order has been\ncommonly used to build and evaluate models that understand such structure. In this work we propose an end-to-end neural approach based on the recently proposed\nset to sequence mapping framework to address the sentence ordering problem. Our model achieves state-of-the-art performance in the order discrimination task\non two datasets widely used in the literature. We also consider a new interesting task of ordering abstracts from conference papers and research proposals and\ndemonstrate strong performance against recent methods. Visualizing the sentence representations learned by the model shows that the model has captured high\nlevel logical structure in these paragraphs. 
The model also learns rich semantic sentence representations by learning to order texts, performing comparably to\nrecent unsupervised representation learning methods in the sentence similarity and paraphrase detection tasks.", "keywords": "Natural language processing;Deep learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Lajanugen Logeswaran;Honglak Lee;Dragomir Radev", "authorids": "llajan@umich.edu;honglak@eecs.umich.edu;radev@umich.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nlogeswaran2017sentence,\ntitle={Sentence Ordering using Recurrent Neural Networks},\nauthor={Lajanugen Logeswaran and Honglak Lee and Dragomir Radev},\nyear={2017},\nurl={https://openreview.net/forum?id=S1AG8zYeg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=S1AG8zYeg", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;3", "rating_avg": 6.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 11, "authors#_avg": 3, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3227080464710239209&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "S1Bb3D5gg", "title": "Learning End-to-End Goal-Oriented Dialog", "track": "main", "status": "Oral", "tldr": "A new open dataset and testbed for training and evaluating end-to-end dialog systems in goal-oriented scenarios.", "abstract": "Traditional dialog systems used in goal-oriented applications require a lot of domain-specific handcrafting, which hinders scaling up to new domains. End- to-end dialog systems, in which all components are trained from the dialogs themselves, escape this limitation. But the encouraging success recently obtained in chit-chat dialog may not carry over to goal-oriented settings. This paper proposes a testbed to break down the strengths and shortcomings of end-to-end dialog systems in goal-oriented applications. Set in the context of restaurant reservation, our tasks require manipulating sentences and symbols, so as to properly conduct conversations, issue API calls and use the outputs of such calls. We show that an end-to-end dialog system based on Memory Networks can reach promising, yet imperfect, performance and learn to perform non-trivial operations. We confirm those results by comparing our system to a hand-crafted slot-filling baseline on data from the second Dialog State Tracking Challenge (Henderson et al., 2014a). 
We show similar result patterns on data extracted from an online concierge service.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Antoine Bordes;Y-Lan Boureau;Jason Weston", "authorids": "abordes@fb.com;ylan@fb.com;jase@fb.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nbordes2017learning,\ntitle={Learning End-to-End Goal-Oriented Dialog},\nauthor={Antoine Bordes and Y-Lan Boureau and Jason Weston},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=S1Bb3D5gg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=S1Bb3D5gg", "pdf_size": 0, "rating": "7;8;8", "confidence": "4;4;5", "rating_avg": 7.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 17, "authors#_avg": 3, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 1004, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14624523216814088198&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "S1Bm3T_lg", "title": "Compositional Kernel Machines", "track": "main", "status": "Workshop", "tldr": "We propose a kernel method that combats the curse of dimensionality with an exponential number of virtual training instances efficiently composed from transformed sub-regions of the original ones.", "abstract": "Convolutional neural networks (convnets) have achieved impressive results on recent computer vision benchmarks. While they benefit from multiple layers that encode nonlinear decision boundaries and a degree of translation invariance, training convnets is a lengthy procedure fraught with local optima. Alternatively, a kernel method that incorporates the compositionality and symmetry of convnets could learn similar nonlinear concepts yet with easier training and architecture selection. We propose compositional kernel machines (CKMs), which effectively create an exponential number of virtual training instances by composing transformed sub-regions of the original ones. Despite this, CKM discriminant functions can be computed efficiently using ideas from sum-product networks. The ability to compose virtual instances in this way gives CKMs invariance to translations and other symmetries, and combats the curse of dimensionality. Just as support vector machines (SVMs) provided a compelling alternative to multilayer perceptrons when they were introduced, CKMs could become an attractive approach for object recognition and other vision problems. In this paper we define CKMs, explore their properties, and present promising results on NORB datasets. 
Experiments show that CKMs can outperform SVMs and be competitive with convnets in a number of dimensions, by learning symmetries and compositional concepts from fewer samples without data augmentation.", "keywords": "Computer vision;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Robert Gens;Pedro Domingos", "authorids": "rcg@cs.washington.edu;pedrod@cs.washington.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ngens2017compositional,\ntitle={Compositional Kernel Machines},\nauthor={Robert Gens and Pedro Domingos},\nyear={2017},\nurl={https://openreview.net/forum?id=S1Bm3T_lg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer5;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=S1Bm3T_lg", "pdf_size": 0, "rating": "5;5;5;6", "confidence": "4;4;3;4", "rating_avg": 5.25, "confidence_avg": 3.75, "replies_avg": 12, "authors#_avg": 2, "corr_rating_confidence": 0.3333333333333333, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6662342217182930982&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "S1HEBe_Jl", "title": "Learning to Protect Communications with Adversarial Neural Cryptography", "track": "main", "status": "Reject", "tldr": "Adversarial training of neural networks to learn rudimentary forms of encryption with no pre-specified algorithms", "abstract": "We ask whether neural networks can learn to use secret keys to protect\ninformation from other neural networks. Specifically, we focus on\nensuring confidentiality properties in a multiagent system, and we\nspecify those properties in terms of an adversary. Thus, a\nsystem may consist of neural networks named Alice and Bob, and we aim\nto limit what a third neural network named Eve learns from\neavesdropping on the communication between Alice and Bob.\nWe do not prescribe specific cryptographic algorithms to these neural networks;\ninstead, we train end-to-end, adversarially.\nWe demonstrate that the neural networks can learn \nhow to perform forms of encryption and decryption, and also\nhow to apply these operations selectively in order to meet\nconfidentiality goals.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mart\u00edn Abadi;David G. Andersen", "authorids": "", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nabadi2017learning,\ntitle={Learning to Protect Communications with Adversarial Neural Cryptography},\nauthor={Mart{\\'\\i}n Abadi and David G. 
Andersen},\nyear={2017},\nurl={https://openreview.net/forum?id=S1HEBe_Jl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=S1HEBe_Jl", "pdf_size": 0, "rating": "4;5;6", "confidence": "2;4;3", "rating_avg": 5.0, "confidence_avg": 3.0, "replies_avg": 12, "authors#_avg": 1, "corr_rating_confidence": 0.5, "gs_citation": 319, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6031431987684545223&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11 }, { "id": "S1HcOI5le", "title": "OMG: Orthogonal Method of Grouping With Application of K-Shot Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Training a classifier with only a few examples remains a significant barrier when using neural networks with large number of parameters. Though various specialized network architectures have been proposed for these k-shot learning tasks to avoid overfitting, a question remains: is there a generalizable framework for the k-shot learning problem that can leverage existing deep models as well as avoid model overfitting? In this paper, we proposed a generalizable k-shot learning framework that can be used on any pre-trained network, by grouping network parameters to produce a low-dimensional representation of the parameter space. The grouping of the parameters is based on an orthogonal decomposition of the parameter space. To avoid overfitting, groups of parameters will be updated together during the k-shot training process. Furthermore, this framework can be integrated with any existing popular deep neural networks such as VGG, GoogleNet, ResNet, without any changes in the original network structure or any sacrifices in performance. We evaluate our framework on a wide range of intra/inter-dataset k-shot learning tasks and show state-of-the-art performance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haoqi Fan;Yu Zhang;Kris M. Kitani", "authorids": "haoqif@andrew.cmu.edu;;", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nfan2017omg,\ntitle={{OMG}: Orthogonal Method of Grouping With Application of K-Shot Learning},\nauthor={Haoqi Fan and Yu Zhang and Kris M. Kitani},\nyear={2017},\nurl={https://openreview.net/forum?id=S1HcOI5le}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=S1HcOI5le", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;5;4", "rating_avg": 4.0, "confidence_avg": 4.333333333333333, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:iaGty7S56IgJ:scholar.google.com/&scioq=OMG:+Orthogonal+Method+of+Grouping+With+Application+of+K-Shot+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "S1J0E-71l", "title": "Surprisal-Driven Feedback in Recurrent Networks", "track": "main", "status": "Reject", "tldr": "In this paper, we add surprisal as additional input to RNN , which take into account past error information when making new predictions. We extend SOTA on character-level language modelling, achieving 1.37 bits/char on wikipedia dataset.", "abstract": "Recurrent neural nets are widely used for predicting temporal data. Their inherent deep feedforward structure allows learning complex sequential patterns. 
It is believed that top-down feedback might be an important missing ingredient which in theory could help disambiguate similar patterns depending on broader context. In this paper, we introduce surprisal-driven recurrent networks, which take into account past error information when making new predictions. This is achieved by continuously monitoring the discrepancy between most recent predictions and the actual observations. Furthermore, we show that it outperforms other stochastic and fully deterministic approaches on enwik8 character level prediction task achieving 1.37 BPC.", "keywords": "Unsupervised Learning;Applications;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Kamil Rocki", "authorids": "", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nrocki2017surprisaldriven,\ntitle={Surprisal-Driven Feedback in Recurrent Networks},\nauthor={Kamil Rocki},\nyear={2017},\nurl={https://openreview.net/forum?id=S1J0E-71l}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=S1J0E-71l", "pdf_size": 0, "rating": "3;3;4", "confidence": "5;5;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.666666666666667, "replies_avg": 14, "authors#_avg": 1, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4199042952885278919&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "S1JG13oee", "title": "b-GAN: Unified Framework of Generative Adversarial Networks", "track": "main", "status": "Reject", "tldr": "New Unified Framework of Generative Adversarial Networks using Bregman divergence beyond f-GAN", "abstract": "Generative adversarial networks (GANs) are successful deep generative models. They are based on a two-player minimax game. However, the objective function derived in the original motivation is changed to obtain stronger gradients when learning the generator. We propose a novel algorithm that repeats density ratio estimation and f-divergence minimization. Our algorithm offers a new unified perspective toward understanding GANs and is able to make use of multiple viewpoints obtained from the density ratio estimation research, e.g. what divergence is stable and relative density ratio is useful. 
", "keywords": "Deep learning;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Masatosi Uehara;Issei Sato;Masahiro Suzuki;Kotaro Nakayama;Yutaka Matsuo", "authorids": "uehara-masatoshi136@g.ecc.u-tokyo.ac.jp;sato@k.u-tokyo.ac.jp;masa@weblab.t.u-tokyo.ac.jp;nakayama@weblab.t.u-tokyo.ac.jp;matsuo@weblab.t.u-tokyo.ac.jp", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nuehara2017bgan,\ntitle={b-{GAN}: Unified Framework of Generative Adversarial Networks},\nauthor={Masatosi Uehara and Issei Sato and Masahiro Suzuki and Kotaro Nakayama and Yutaka Matsuo},\nyear={2017},\nurl={https://openreview.net/forum?id=S1JG13oee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=S1JG13oee", "pdf_size": 0, "rating": "4;5;6", "confidence": "3;3;4", "rating_avg": 5.0, "confidence_avg": 3.3333333333333335, "replies_avg": 10, "authors#_avg": 5, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6917740929258860238&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "S1Jhfftgx", "title": "Enforcing constraints on outputs with unconstrained inference", "track": "main", "status": "Reject", "tldr": "An inference method for enforcing hard constraints on the outputs of neural networks without combinatorial search, with applications in NLP and structured prediction.", "abstract": " Increasingly, practitioners apply neural networks to complex\n problems in natural language processing (NLP), such as syntactic\n parsing, that have rich output structures. Many such applications\n require deterministic constraints on the output values; for example,\n requiring that the sequential outputs encode a valid tree. While\n hidden units might capture such properties, the network is not\n always able to learn them from the training data alone, and\n practitioners must then resort to post-processing. In this paper, we\n present an inference method for neural networks that enforces\n deterministic constraints on outputs without performing\n post-processing or expensive discrete search over the feasible\n space. Instead, for each input, we nudge the continuous weights\n until the network's unconstrained inference procedure generates an\n output that satisfies the constraints. We find that our method\n reduces the number of violating outputs by up to 81\\%, while\n improving accuracy.", "keywords": "Natural language processing;Structured prediction;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Jay Yoon Lee;Michael L. Wick;Jean-Baptiste Tristan", "authorids": "lee.jayyoon@gmail.com;michael.wick@oracle.com;jean.baptiste.tristan@oracle.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nlee2017enforcing,\ntitle={Enforcing constraints on outputs with unconstrained inference},\nauthor={Jay Yoon Lee and Michael L. 
Wick and Jean-Baptiste Tristan},\nyear={2017},\nurl={https://openreview.net/forum?id=S1Jhfftgx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=S1Jhfftgx", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;4;5", "rating_avg": 3.3333333333333335, "confidence_avg": 4.333333333333333, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13665417019967114729&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "S1LVSrcge", "title": "Variable Computation in Recurrent Neural Networks", "track": "main", "status": "Poster", "tldr": "We show that an RNN can learn to control the amount of computation it does at each time step, leading to better efficiency and performance as well as discovering time patterns of interest.", "abstract": "Recurrent neural networks (RNNs) have been used extensively and with increasing success to model various types of sequential data. Much of this progress has been achieved through devising recurrent units and architectures with the flexibility to capture complex statistics in the data, such as long range dependency or localized attention phenomena. However, while many sequential data (such as video, speech or language) can have highly variable information flow, most recurrent models still consume input features at a constant rate and perform a constant number of computations per time step, which can be detrimental to both speed and model capacity. In this paper, we explore a modification to existing recurrent units which allows them to learn to vary the amount of computation they perform at each step, without prior knowledge of the sequence's time structure. 
We show experimentally that not only do our models require fewer operations, they also lead to better performance overall on evaluation tasks.", "keywords": "Natural language processing;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Yacine Jernite;Edouard Grave;Armand Joulin;Tomas Mikolov", "authorids": "yacine.jernite@nyu.edu;egrave@fb.com;ajoulin@fb.com;tmikolov@fb.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\njernite2017variable,\ntitle={Variable Computation in Recurrent Neural Networks},\nauthor={Yacine Jernite and Edouard Grave and Armand Joulin and Tomas Mikolov},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=S1LVSrcge}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer2", "site": "https://openreview.net/forum?id=S1LVSrcge", "pdf_size": 0, "rating": "4;7;7", "confidence": "5;4;4", "rating_avg": 6.0, "confidence_avg": 4.333333333333333, "replies_avg": 18, "authors#_avg": 4, "corr_rating_confidence": -1.0, "gs_citation": 86, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16932139306086204763&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "S1OufnIlx", "title": "Adversarial examples in the physical world", "track": "main", "status": "Workshop", "tldr": "", "abstract": "Most existing machine learning classifiers are highly vulnerable to adversarial examples.\nAn adversarial example is a sample of input data which has been modified\nvery slightly in a way that is intended to cause a machine learning classifier\nto misclassify it.\nIn many cases, these modifications can be so subtle that a human observer does\nnot even notice the modification at all, yet the classifier still makes a mistake.\nAdversarial examples pose security concerns\nbecause they could be used to perform an attack on machine learning systems, even if the adversary has no\naccess to the underlying model.\nUp to now, all previous work has assumed a threat model in which the adversary can\nfeed data directly into the machine learning classifier.\nThis is not always the case for systems operating in the physical world,\nfor example those which are using signals from cameras and other sensors as input.\nThis paper shows that even in such physical world scenarios, machine learning systems are vulnerable\nto adversarial examples.\nWe demonstrate this by feeding adversarial images obtained from a cell-phone camera\nto an ImageNet Inception classifier and measuring the classification accuracy of the system.\nWe find that a large fraction of adversarial examples are classified incorrectly\neven when perceived through the camera.", "keywords": "Supervised Learning;Computer vision", "primary_area": "", "supplementary_material": "", "author": "Alexey Kurakin;Ian J. 
Goodfellow;Samy Bengio", "authorids": "kurakin@google.com;ian@openai.com;bengio@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nkurakin2017adversarial,\ntitle={Adversarial examples in the physical world},\nauthor={Alexey Kurakin and Ian J. Goodfellow and Samy Bengio},\nyear={2017},\nurl={https://openreview.net/forum?id=S1OufnIlx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=S1OufnIlx", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;3;3", "rating_avg": 5.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 11, "authors#_avg": 3, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 7407, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16342929405179799360&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 13 }, { "id": "S1QefL5ge", "title": "Online Structure Learning for Sum-Product Networks with Gaussian Leaves", "track": "main", "status": "Workshop", "tldr": "This paper describes the first online structure learning technique for continuous SPNs with Gaussian leaves.", "abstract": "Sum-product networks have recently emerged as an attractive representation due to their dual view as a special type of deep neural network with clear semantics and a special type of probabilistic graphical model for which inference is always tractable. Those properties follow from some conditions (i.e., completeness and decomposability) that must be respected by the structure of the network. As a result, it is not easy to specify a valid sum-product network by hand and therefore structure learning techniques are typically used in practice. This paper describes the first {\em online} structure learning technique for continuous SPNs with Gaussian leaves. 
We also introduce an accompanying new parameter learning technique.", "keywords": "Unsupervised Learning;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Wilson Hsu;Agastya Kalra;Pascal Poupart", "authorids": "wwhsu@uwaterloo.ca;a6kalra@uwaterloo.ca;ppoupart@uwaterloo.ca", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nhsu2017online,\ntitle={Online Structure Learning for Sum-Product Networks with Gaussian Leaves},\nauthor={Wilson Hsu and Agastya Kalra and Pascal Poupart},\nyear={2017},\nurl={https://openreview.net/forum?id=S1QefL5ge}\n}", "github": "", "project": "", "reviewers": "AnonReviewer5;AnonReviewer6;AnonReviewer4", "site": "https://openreview.net/forum?id=S1QefL5ge", "pdf_size": 0, "rating": "4;4;6", "confidence": "1;2;3", "rating_avg": 4.666666666666667, "confidence_avg": 2.0, "replies_avg": 20, "authors#_avg": 3, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13931250777210671504&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "S1RP6GLle", "title": "Amortised MAP Inference for Image Super-resolution", "track": "main", "status": "Oral", "tldr": "Probabilistically motivated image superresolution using a projection to the subspace of valid solutions", "abstract": "Image super-resolution (SR) is an underdetermined inverse problem, where a large number of plausible high resolution images can explain the same downsampled image. Most current single image SR methods use empirical risk minimisation, often with a pixel-wise mean squared error (MSE) loss.\nHowever, the outputs from such methods tend to be blurry, over-smoothed and generally appear implausible. A more desirable approach would employ Maximum a Posteriori (MAP) inference, preferring solutions that always have a high probability under the image prior, and thus appear more plausible. Direct MAP estimation for SR is non-trivial, as it requires us to build a model for the image prior from samples. Here we introduce new methods for \emph{amortised MAP inference} whereby we calculate the MAP estimate directly using a convolutional neural network. We first introduce a novel neural network architecture that performs a projection to the affine subspace of valid SR solutions ensuring that the high resolution output of the network is always consistent with the low resolution input. We show that, using this architecture, the amortised MAP inference problem reduces to minimising the cross-entropy between two distributions, similar to training generative models. We propose three methods to solve this optimisation problem: (1) Generative Adversarial Networks (GAN) (2) denoiser-guided SR which backpropagates gradient-estimates from denoising to train the network, and (3) a baseline method using a maximum-likelihood-trained image prior. Our experiments show that the GAN based approach performs best on real image data. Lastly, we establish a connection between GANs and amortised variational inference as in e.g. 
variational autoencoders.", "keywords": "Theory;Computer vision;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Casper Kaae S\u00f8nderby;Jose Caballero;Lucas Theis;Wenzhe Shi;Ferenc Husz\u00e1r", "authorids": "casperkaae@gmail.com;jcaballero@twitter.com;ltheis@twitter.com;wshi@twitter.com;fhuszar@twitter.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\ns{\\o}nderby2017amortised,\ntitle={Amortised {MAP} Inference for Image Super-resolution},\nauthor={Casper Kaae S{\\o}nderby and Jose Caballero and Lucas Theis and Wenzhe Shi and Ferenc Husz{\\'a}r},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=S1RP6GLle}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=S1RP6GLle", "pdf_size": 0, "rating": "7;8;9", "confidence": "2;5;3", "rating_avg": 8.0, "confidence_avg": 3.3333333333333335, "replies_avg": 11, "authors#_avg": 5, "corr_rating_confidence": 0.32732683535398854, "gs_citation": 538, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14286004747394736744&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "id": "S1TER2oll", "title": "FILTER SHAPING FOR CONVOLUTIONAL NEURAL NETWORKS", "track": "main", "status": "Poster", "tldr": "", "abstract": "Convolutional neural networks (CNNs) are powerful tools for classification of visual inputs. An important property of CNN is its restriction to local connections and sharing of local weights among different locations. In this paper, we consider the definition of appropriate local neighborhoods in CNN. We provide a theoretical analysis that justifies the traditional square filter used in CNN for analyzing natural images. The analysis also provides a principle for designing customized filter shapes for application domains that do not resemble natural images. We propose an approach that automatically designs multiple layers of different customized filter shapes by repeatedly solving lasso problems. It is applied to customize the filter shape for both bioacoustic applications and gene sequence analysis applications. 
In those domains with small sample sizes we demonstrate that the customized filters achieve superior classification accuracy, improved convergence behavior in training and reduced sensitivity to hyperparameters.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xingyi Li;Fuxin Li;Xiaoli Fern;Raviv Raich", "authorids": "lixin@eecs.oregonstate.edu;lif@eecs.oregonstate.edu;xfern@eecs.oregonstate.edu;raich@eecs.oregonstate.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nli2017filter,\ntitle={{FILTER} {SHAPING} {FOR} {CONVOLUTIONAL} {NEURAL} {NETWORKS}},\nauthor={Xingyi Li and Fuxin Li and Xiaoli Fern and Raviv Raich},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=S1TER2oll}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=S1TER2oll", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;4;3", "rating_avg": 6.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 16, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9930175089977987060&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 2 }, { "id": "S1VaB4cex", "title": "FractalNet: Ultra-Deep Neural Networks without Residuals", "track": "main", "status": "Poster", "tldr": "", "abstract": "We introduce a design strategy for neural network macro-architecture based on self-similarity. Repeated application of a simple expansion rule generates deep networks whose structural layouts are precisely truncated fractals. These networks contain interacting subpaths of different lengths, but do not include any pass-through or residual connections; every internal signal is transformed by a filter and nonlinearity before being seen by subsequent layers. In experiments, fractal networks match the excellent performance of standard residual networks on both CIFAR and ImageNet classification tasks, thereby demonstrating that residual representations may not be fundamental to the success of extremely deep convolutional neural networks. Rather, the key may be the ability to transition, during training, from effectively shallow to deep. We note similarities with student-teacher behavior and develop drop-path, a natural extension of dropout, to regularize co-adaptation of subpaths in fractal architectures. Such regularization allows extraction of high-performance fixed-depth subnetworks. 
Additionally, fractal networks exhibit an anytime property: shallow subnetworks provide a quick answer, while deeper subnetworks, with higher latency, provide a more accurate answer.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gustav Larsson;Michael Maire;Gregory Shakhnarovich", "authorids": "larsson@cs.uchicago.edu;mmaire@ttic.edu;greg@ttic.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nlarsson2017fractalnet,\ntitle={FractalNet: Ultra-Deep Neural Networks without Residuals},\nauthor={Gustav Larsson and Michael Maire and Gregory Shakhnarovich},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=S1VaB4cex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=S1VaB4cex", "pdf_size": 0, "rating": "5;6;6", "confidence": "5;4;5", "rating_avg": 5.666666666666667, "confidence_avg": 4.666666666666667, "replies_avg": 13, "authors#_avg": 3, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 1321, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15300779753326541860&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "S1X7nhsxl", "title": "Improving Generative Adversarial Networks with Denoising Feature Matching", "track": "main", "status": "Poster", "tldr": "Use a denoiser trained on discriminator features to train better generators.", "abstract": "We propose an augmented training procedure for generative adversarial networks designed to address shortcomings of the original by directing the generator towards probable configurations of abstract discriminator features. We estimate and track the distribution of these features, as computed from data, with a denoising auto-encoder, and use it to propose high-level targets for the generator. 
We combine this new loss with the original and evaluate the hybrid criterion on the task of unsupervised image synthesis from datasets comprising a diverse set of visual categories, noting a qualitative and quantitative improvement in the ``objectness'' of the resulting samples.", "keywords": "Deep learning;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "David Warde-Farley;Yoshua Bengio", "authorids": "d.warde.farley@gmail.com;yoshua.umontreal@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nwarde-farley2017improving,\ntitle={Improving Generative Adversarial Networks with Denoising Feature Matching},\nauthor={David Warde-Farley and Yoshua Bengio},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=S1X7nhsxl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=S1X7nhsxl", "pdf_size": 0, "rating": "6;7;7", "confidence": "2;4;5", "rating_avg": 6.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 17, "authors#_avg": 2, "corr_rating_confidence": 0.9449111825230683, "gs_citation": 185, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12923668823698343528&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "S1Y0td9ee", "title": "Shift Aggregate Extract Networks", "track": "main", "status": "Workshop", "tldr": "Shift Aggregate Extract Networks for learning on social network data", "abstract": "The Shift Aggregate Extract Network SAEN is an architecture for learning representations on social network data.\nSAEN decomposes input graphs into hierarchies made of multiple strata of objects.\nVector representations of each object are learnt by applying 'shift', 'aggregate' and 'extract' operations on the vector representations of its parts.\nWe propose an algorithm for domain compression which takes advantage of symmetries in hierarchical decompositions to reduce the memory usage and obtain significant speedups.\nOur method is empirically evaluated on real world social network datasets, outperforming the current state of the art.", "keywords": "Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Francesco Orsini;Daniele Baracchi;Paolo Frasconi", "authorids": "francesco.orsini@kuleuven.be;daniele.baracchi@unifi.it;paolo.frasconi@unifi.it", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\norsini2017shift,\ntitle={Shift Aggregate Extract Networks},\nauthor={Francesco Orsini and Daniele Baracchi and Paolo Frasconi},\nyear={2017},\nurl={https://openreview.net/forum?id=S1Y0td9ee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=S1Y0td9ee", "pdf_size": 0, "rating": "3;5;5", "confidence": "2;3;3", "rating_avg": 4.333333333333333, "confidence_avg": 2.6666666666666665, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": 0.9999999999999998, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14478546218001990087&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12 }, { "id": "S1_pAu9xl", "title": "Trained Ternary 
Quantization", "track": "main", "status": "Poster", "tldr": "Ternary Neural Network with accuracy close to or even higher than the full-precision one", "abstract": "Deep neural networks are widely used in machine learning applications. However, the deployment of large neural networks models can be difficult to deploy on mobile devices with limited power budgets. To solve this problem, we propose Trained Ternary Quantization (TTQ), a method that can reduce the precision of weights in neural networks to ternary values. This method has very little accuracy degradation and can even improve the accuracy of some models (32, 44, 56-layer ResNet) on CIFAR-10 and AlexNet on ImageNet. And our AlexNet model is trained from scratch, which means it\u2019s as easy as to train normal full precision model. We highlight our trained quantization method that can learn both ternary values and ternary assignment. During inference, only ternary values (2-bit weights) and scaling factors are needed, therefore our models are nearly 16\u00d7 smaller than full- precision models. Our ternary models can also be viewed as sparse binary weight networks, which can potentially be accelerated with custom circuit. Experiments on CIFAR-10 show that the ternary models obtained by trained quantization method outperform full-precision models of ResNet-32,44,56 by 0.04%, 0.16%, 0.36%, respectively. On ImageNet, our model outperforms full-precision AlexNet model by 0.3% of Top-1 accuracy and outperforms previous ternary models by 3%.", "keywords": "Deep learning", "primary_area": "", "supplementary_material": "", "author": "Chenzhuo Zhu;Song Han;Huizi Mao;William J. Dally", "authorids": "zhucz13@mails.tsinghua.edu.cn;songhan@stanford.edu;huizi@stanford.edu;dally@stanford.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nzhu2017trained,\ntitle={Trained Ternary Quantization},\nauthor={Chenzhuo Zhu and Song Han and Huizi Mao and William J. Dally},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=S1_pAu9xl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=S1_pAu9xl", "pdf_size": 0, "rating": "3;7;7;8", "confidence": "3;5;3;5", "rating_avg": 6.25, "confidence_avg": 4.0, "replies_avg": 20, "authors#_avg": 4, "corr_rating_confidence": 0.6509445549041193, "gs_citation": 1391, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12956735651240747861&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "S1c2cvqee", "title": "Designing Neural Network Architectures using Reinforcement Learning", "track": "main", "status": "Poster", "tldr": "A Q-learning algorithm for automatically generating neural nets", "abstract": "At present, designing convolutional neural network (CNN) architectures requires both human expertise and labor. New architectures are handcrafted by careful experimentation or modified from a handful of existing networks. We introduce MetaQNN, a meta-modeling algorithm based on reinforcement learning to automatically generate high-performing CNN architectures for a given learning task. The learning agent is trained to sequentially choose CNN layers using $Q$-learning with an $\\epsilon$-greedy exploration strategy and experience replay. 
The agent explores a large but finite space of possible architectures and iteratively discovers designs with improved performance on the learning task. On image classification benchmarks, the agent-designed networks (consisting of only standard convolution, pooling, and fully-connected layers) beat existing networks designed with the same layer types and are competitive against the state-of-the-art methods that use more complex layer types. We also outperform existing meta-modeling approaches for network design on image classification tasks.", "keywords": "Deep learning;Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Bowen Baker;Otkrist Gupta;Nikhil Naik;Ramesh Raskar", "authorids": "bowen@mit.edu;otkrist@mit.edu;naik@mit.edu;raskar@mit.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nbaker2017designing,\ntitle={Designing Neural Network Architectures using Reinforcement Learning},\nauthor={Bowen Baker and Otkrist Gupta and Nikhil Naik and Ramesh Raskar},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=S1c2cvqee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=S1c2cvqee", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;3;4", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 19, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 1983, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1457104897417222523&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "S1dIzvclg", "title": "A recurrent neural network without chaos", "track": "main", "status": "Poster", "tldr": "", "abstract": "We introduce an exceptionally simple gated recurrent neural network (RNN) that achieves performance comparable to well-known gated architectures, such as LSTMs and GRUs, on the word-level language modeling task. We prove that our model has simple, predicable and non-chaotic dynamics. 
This stands in stark contrast to more standard gated architectures, whose underlying dynamical systems exhibit chaotic behavior.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Thomas Laurent;James von Brecht", "authorids": "tlaurent@lmu.edu;james.vonbrecht@csulb.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nlaurent2017a,\ntitle={A recurrent neural network without chaos},\nauthor={Thomas Laurent and James von Brecht},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=S1dIzvclg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=S1dIzvclg", "pdf_size": 0, "rating": "7;7;8", "confidence": "4;4;3", "rating_avg": 7.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 16, "authors#_avg": 2, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 77, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11945525315252842430&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "S1di0sfgl", "title": "Hierarchical Multiscale Recurrent Neural Networks", "track": "main", "status": "Poster", "tldr": "Propose a recurrent neural network architecture that can discover the underlying hierarchical structure in the temporal data.", "abstract": "Learning both hierarchical and temporal representation has been among the long- standing challenges of recurrent neural networks. Multiscale recurrent neural networks have been considered as a promising approach to resolve this issue, yet there has been a lack of empirical evidence showing that this type of models can actually capture the temporal dependencies by discovering the latent hierarchical structure of the sequence. In this paper, we propose a novel multiscale approach, called the hierarchical multiscale recurrent neural network, that can capture the latent hierarchical structure in the sequence by encoding the temporal dependencies with different timescales using a novel update mechanism. We show some evidence that the proposed model can discover underlying hierarchical structure in the sequences without using explicit boundary information. 
We evaluate our proposed model on character-level language modelling and handwriting sequence generation.", "keywords": "Natural language processing;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Junyoung Chung;Sungjin Ahn;Yoshua Bengio", "authorids": "junyoung.chung@umontreal.ca;sungjin.ahn@umontreal.ca;yoshua.bengio@umontreal.ca", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nchung2017hierarchical,\ntitle={Hierarchical Multiscale Recurrent Neural Networks},\nauthor={Junyoung Chung and Sungjin Ahn and Yoshua Bengio},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=S1di0sfgl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=S1di0sfgl", "pdf_size": 0, "rating": "7;8;8", "confidence": "3;4;4", "rating_avg": 7.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 11, "authors#_avg": 3, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 690, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3631406206229660252&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "S1j4RqYxg", "title": "Efficient Calculation of Polynomial Features on Sparse Matrices", "track": "main", "status": "Reject", "tldr": "An algorithm to perform polynomial expansions on CSR matrices that scales with matrix density polynomially.", "abstract": "We provide an algorithm for polynomial feature expansion that both operates on\nand produces a compressed sparse row matrix without any densification. For a\nvector of dimension D, density d, and degree k the algorithm has time complexity\nO(d^k * D^k) where k is the polynomial-feature order; this is an improvement by a factor d^k\nover the standard method.", "keywords": "Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Andrew Nystrom;John Hughes", "authorids": "awnystrom@gmail.com;jfh@cs.brown.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nnystrom2017efficient,\ntitle={Efficient Calculation of Polynomial Features on Sparse Matrices},\nauthor={Andrew Nystrom and John Hughes},\nyear={2017},\nurl={https://openreview.net/forum?id=S1j4RqYxg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=S1j4RqYxg", "pdf_size": 0, "rating": "3;3;3", "confidence": "3;3;1", "rating_avg": 3.0, "confidence_avg": 2.3333333333333335, "replies_avg": 5, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:_XeTPsz-mXQJ:scholar.google.com/&scioq=Efficient+Calculation+of+Polynomial+Features+on+Sparse+Matrices&hl=en&as_sdt=0,5", "gs_version_total": 3 }, { "id": "S1jE5L5gl", "title": "The Concrete Distribution: A Continuous Relaxation of Discrete Random Variables", "track": "main", "status": "Poster", "tldr": "Relaxed reparameterization trick for discrete stochastic units.", "abstract": "The reparameterization trick enables optimizing large scale stochastic computation graphs via gradient descent. 
The essence of the trick is to refactor each stochastic node into a differentiable function of its parameters and a random variable with fixed distribution. After refactoring, the gradients of the loss propagated by the chain rule through the graph are low variance unbiased estimators of the gradients of the expected loss. While many continuous random variables have such reparameterizations, discrete random variables lack useful reparameterizations due to the discontinuous nature of discrete states. In this work we introduce Concrete random variables -- continuous relaxations of discrete random variables. The Concrete distribution is a new family of distributions with closed form densities and a simple reparameterization. Whenever a discrete stochastic node of a computation graph can be refactored into a one-hot bit representation that is treated continuously, Concrete stochastic nodes can be used with automatic differentiation to produce low-variance biased gradients of objectives (including objectives that depend on the log-probability of latent stochastic nodes) on the corresponding discrete graph. We demonstrate the effectiveness of Concrete relaxations on density estimation and structured prediction tasks using neural networks.\n", "keywords": "Deep learning;Unsupervised Learning;Structured prediction", "primary_area": "", "supplementary_material": "", "author": "Chris J. Maddison;Andriy Mnih;Yee Whye Teh", "authorids": "cmaddis@stats.ox.ac.uk;amnih@google.com;y.w.teh@stats.ox.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nmaddison2017the,\ntitle={The Concrete Distribution: A Continuous Relaxation of Discrete Random Variables},\nauthor={Chris J. Maddison and Andriy Mnih and Yee Whye Teh},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=S1jE5L5gl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=S1jE5L5gl", "pdf_size": 0, "rating": "7;8;9", "confidence": "3;4;5", "rating_avg": 8.0, "confidence_avg": 4.0, "replies_avg": 11, "authors#_avg": 3, "corr_rating_confidence": 1.0, "gs_citation": 3092, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16482228288411412158&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "S1jmAotxg", "title": "Stick-Breaking Variational Autoencoders", "track": "main", "status": "Poster", "tldr": "We define a variational autoencoder variant with stick-breaking latent variables thereby giving it adaptive width.", "abstract": "We extend Stochastic Gradient Variational Bayes to perform posterior inference for the weights of Stick-Breaking processes. This development allows us to define a Stick-Breaking Variational Autoencoder (SB-VAE), a Bayesian nonparametric version of the variational autoencoder that has a latent representation with stochastic dimensionality. 
We experimentally demonstrate that the SB-VAE, and a semi-supervised variant, learn highly discriminative latent representations that often outperform the Gaussian VAE\u2019s.", "keywords": "Deep learning;Unsupervised Learning;Semi-Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Eric Nalisnick;Padhraic Smyth", "authorids": "enalisni@uci.edu;smyth@ics.uci.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nnalisnick2017stickbreaking,\ntitle={Stick-Breaking Variational Autoencoders},\nauthor={Eric Nalisnick and Padhraic Smyth},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=S1jmAotxg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=S1jmAotxg", "pdf_size": 0, "rating": "4;8;8", "confidence": "4;4;5", "rating_avg": 6.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 22, "authors#_avg": 2, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 194, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3993746898648797152&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "S1oWlN9ll", "title": "Loss-aware Binarization of Deep Networks", "track": "main", "status": "Poster", "tldr": "", "abstract": "Deep neural network models, though very powerful and highly successful, are computationally expensive in terms of space and time. Recently, there have been a number of attempts on binarizing the network weights and activations. This greatly reduces the network size, and replaces the underlying multiplications to additions or even XNOR bit operations. However, existing binarization schemes are based on simple matrix approximations and ignore the effect of binarization on the loss. In this paper, we propose a proximal Newton algorithm with diagonal Hessian approximation that directly minimizes the loss w.r.t. the binarized weights. The underlying proximal step has an efficient closed-form solution, and the second-order information can be efficiently obtained from the second moments already computed by the Adam optimizer. Experiments on both feedforward and recurrent networks show that the proposed loss-aware binarization algorithm outperforms existing binarization schemes, and is also more robust for wide and deep networks.", "keywords": "Deep learning;Applications;Optimization", "primary_area": "", "supplementary_material": "", "author": "Lu Hou;Quanming Yao;James T. Kwok", "authorids": "lhouab@cse.ust.hk;qyaoaa@cse.ust.hk;jamesk@cse.ust.hk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nhou2017lossaware,\ntitle={Loss-aware Binarization of Deep Networks},\nauthor={Lu Hou and Quanming Yao and James T. 
Kwok},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=S1oWlN9ll}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=S1oWlN9ll", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;3;3", "rating_avg": 7.0, "confidence_avg": 3.3333333333333335, "replies_avg": 21, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 272, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15694355493711957168&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 13 }, { "id": "S1vyujVye", "title": "Deep unsupervised learning through spatial contrasting", "track": "main", "status": "Reject", "tldr": "", "abstract": "Convolutional networks have marked their place over the last few years as the\nbest performing model for various visual tasks. They are, however, most suited\nfor supervised learning from large amounts of labeled data. Previous attempts\nhave been made to use unlabeled data to improve model performance by applying\nunsupervised techniques. These attempts require different architectures and training methods.\nIn this work we present a novel approach for unsupervised training\nof Convolutional networks that is based on contrasting between spatial regions\nwithin images. This criterion can be employed within conventional neural net-\nworks and trained using standard techniques such as SGD and back-propagation,\nthus complementing supervised methods.", "keywords": "Unsupervised Learning;Deep learning;Computer vision", "primary_area": "", "supplementary_material": "", "author": "Elad Hoffer;Itay Hubara;Nir Ailon", "authorids": "ehoffer@tx.technion.ac.il;itayh@tx.technion.ac.il;nailon@cs.technion.ac.il", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nhoffer2017deep,\ntitle={Deep unsupervised learning through spatial contrasting},\nauthor={Elad Hoffer and Itay Hubara and Nir Ailon},\nyear={2017},\nurl={https://openreview.net/forum?id=S1vyujVye}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=S1vyujVye", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;4;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 13, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5871567940732623771&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "S1xh5sYgx", "title": "SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and <0.5MB model size", "track": "main", "status": "Reject", "tldr": "Small CNN models", "abstract": "Recent research on deep neural networks has focused primarily on improving accuracy. For a given accuracy level, it is typically possible to identify multiple DNN architectures that achieve that accuracy level. With equivalent accuracy, smaller DNN architectures offer at least three advantages: (1) Smaller DNNs require less communication across servers during distributed training. (2) Smaller DNNs require less bandwidth to export a new model from the cloud to an autonomous car. (3) Smaller DNNs are more feasible to deploy on FPGAs and other hardware with limited memory. To provide all of these advantages, we propose a small DNN architecture called SqueezeNet. 
SqueezeNet achieves AlexNet-level accuracy on ImageNet with 50x fewer parameters. Additionally, with model compression techniques we are able to compress SqueezeNet to less than 0.5MB (510x smaller than AlexNet).", "keywords": "Computer vision;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Forrest N. Iandola;Song Han;Matthew W. Moskewicz;Khalid Ashraf;William J. Dally;Kurt Keutzer", "authorids": "forresti@eecs.berkeley.edu;songhan@stanford.edu;moskewcz@eecs.berkeley.edu;kashraf@eecs.berkeley.edu;dally@stanford.edu;keutzer@eecs.berkeley.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\niandola2017squeezenet,\ntitle={SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and \\ensuremath{<}0.5{MB} model size},\nauthor={Forrest N. Iandola and Song Han and Matthew W. Moskewicz and Khalid Ashraf and William J. Dally and Kurt Keutzer},\nyear={2017},\nurl={https://openreview.net/forum?id=S1xh5sYgx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=S1xh5sYgx", "pdf_size": 0, "rating": "5;7;7", "confidence": "4;4;3", "rating_avg": 6.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 13, "authors#_avg": 6, "corr_rating_confidence": -0.49999999999999983, "gs_citation": 11251, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17131899958223648583&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 14 }, { "id": "SJ-uGHcee", "title": "Efficient iterative policy optimization", "track": "main", "status": "Reject", "tldr": "", "abstract": "We tackle the issue of finding a good policy when the number of policy updates is limited. This is done by approximating the expected policy reward as a sequence of concave lower bounds which can be efficiently maximized, drastically reducing the number of policy updates required to achieve good performance. 
We also extend existing methods to negative rewards, enabling the use of control variates.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nicolas Le Roux", "authorids": "nicolas@le-roux.name", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nroux2017efficient,\ntitle={Efficient iterative policy optimization},\nauthor={Nicolas Le Roux},\nyear={2017},\nurl={https://openreview.net/forum?id=SJ-uGHcee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SJ-uGHcee", "pdf_size": 0, "rating": "3;5;7", "confidence": "2;4;3", "rating_avg": 5.0, "confidence_avg": 3.0, "replies_avg": 9, "authors#_avg": 1, "corr_rating_confidence": 0.5, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=915960247279139779&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "SJ25-B5eg", "title": "The Neural Noisy Channel", "track": "main", "status": "Poster", "tldr": "We formulate sequence to sequence transduction as a noisy channel decoding problem and use recurrent neural networks to parameterise the source and channel models.", "abstract": "We formulate sequence to sequence transduction as a noisy channel decoding problem and use recurrent neural networks to parameterise the source and channel models. Unlike direct models which can suffer from explaining-away effects during training, noisy channel models must produce outputs that explain their inputs, and their component models can be trained with not only paired training samples but also unpaired samples from the marginal output distribution. Using a latent variable to control how much of the conditioning sequence the channel model needs to read in order to generate a subsequent symbol, we obtain a tractable and effective beam search decoder. 
Experimental results on abstractive sentence summarisation, morphological inflection, and machine translation show that noisy channel models outperform direct models, and that they significantly benefit from increased amounts of unpaired output data that direct models cannot easily use.", "keywords": "Natural language processing;Deep learning;Semi-Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Lei Yu;Phil Blunsom;Chris Dyer;Edward Grefenstette;Tomas Kocisky", "authorids": "lei.yu@cs.ox.ac.uk;pblunsom@google.com;cdyer@google.com;etg@google.com;tkocisky@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nyu2017the,\ntitle={The Neural Noisy Channel},\nauthor={Lei Yu and Phil Blunsom and Chris Dyer and Edward Grefenstette and Tomas Kocisky},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=SJ25-B5eg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=SJ25-B5eg", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;4;4", "rating_avg": 6.666666666666667, "confidence_avg": 4.0, "replies_avg": 13, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 80, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14152670177572241515&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "SJ3rcZcxl", "title": "Q-Prop: Sample-Efficient Policy Gradient with An Off-Policy Critic", "track": "main", "status": "Oral", "tldr": "We propose Q-Prop, a novel policy gradient method with an off-policy critic as control variate, that is more sample efficient than TRPO-GAE and more stable than DDPG, the state-of-the-art on-policy and off-policy methods.", "abstract": "Model-free deep reinforcement learning (RL) methods have been successful in a wide variety of simulated domains. However, a major obstacle facing deep RL in the real world is their high sample complexity. Batch policy gradient methods offer stable learning, but at the cost of high variance, which often requires large batches. TD-style methods, such as off-policy actor-critic and Q-learning, are more sample-efficient but biased, and often require costly hyperparameter sweeps to stabilize. In this work, we aim to develop methods that combine the stability of policy gradients with the efficiency of off-policy RL. We present Q-Prop, a policy gradient method that uses a Taylor expansion of the off-policy critic as a control variate. Q-Prop is both sample efficient and stable, and effectively combines the benefits of on-policy and off-policy methods. We analyze the connection between Q-Prop and existing model-free algorithms, and use control variate theory to derive two variants of Q-Prop with conservative and aggressive adaptation. We show that conservative Q-Prop provides substantial gains in sample efficiency over trust region policy optimization (TRPO) with generalized advantage estimation (GAE), and improves stability over deep deterministic policy gradient (DDPG), the state-of-the-art on-policy and off-policy methods, on OpenAI Gym's MuJoCo continuous control environments.", "keywords": "Deep learning;Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Shixiang Gu;Timothy Lillicrap;Zoubin Ghahramani;Richard E. 
Turner;Sergey Levine", "authorids": "sg717@cam.ac.uk;countzero@google.com;zoubin@eng.cam.ac.uk;ret26@cam.ac.uk;svlevine@eecs.berkeley.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\ngu2017qprop,\ntitle={Q-Prop: Sample-Efficient Policy Gradient with An Off-Policy Critic},\nauthor={Shixiang Gu and Timothy Lillicrap and Zoubin Ghahramani and Richard E. Turner and Sergey Levine},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=SJ3rcZcxl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SJ3rcZcxl", "pdf_size": 0, "rating": "7;7;7;8", "confidence": "4;5;4;3", "rating_avg": 7.25, "confidence_avg": 4.0, "replies_avg": 29, "authors#_avg": 5, "corr_rating_confidence": -0.816496580927726, "gs_citation": 436, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9727184745997671136&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 9 }, { "id": "SJ6yPD5xg", "title": "Reinforcement Learning with Unsupervised Auxiliary Tasks", "track": "main", "status": "Oral", "tldr": "", "abstract": "Deep reinforcement learning agents have achieved state-of-the-art results by directly maximising cumulative reward. However, environments contain a much wider variety of possible training signals. In this paper, we introduce an agent that also maximises many other pseudo-reward functions simultaneously by reinforcement learning. All of these tasks share a common representation that, like unsupervised learning, continues to develop in the absence of extrinsic rewards. We also introduce a novel mechanism for focusing this representation upon extrinsic rewards, so that learning can rapidly adapt to the most relevant aspects of the actual task. 
Our agent significantly outperforms the previous state-of-the-art on Atari, averaging 880\\% expert human performance, and a challenging suite of first-person, three-dimensional \\emph{Labyrinth} tasks leading to a mean speedup in learning of 10$\\times$ and averaging 87\\% expert human performance on Labyrinth.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Max Jaderberg;Volodymyr Mnih;Wojciech Marian Czarnecki;Tom Schaul;Joel Z Leibo;David Silver;Koray Kavukcuoglu", "authorids": "jaderberg@google.com;vmnih@google.com;lejlot@google.com;schaul@google.com;jzl@google.com;davidsilver@google.com;korayk@google.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\njaderberg2017reinforcement,\ntitle={Reinforcement Learning with Unsupervised Auxiliary Tasks},\nauthor={Max Jaderberg and Volodymyr Mnih and Wojciech Marian Czarnecki and Tom Schaul and Joel Z Leibo and David Silver and Koray Kavukcuoglu},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=SJ6yPD5xg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer5;AnonReviewer4", "site": "https://openreview.net/forum?id=SJ6yPD5xg", "pdf_size": 0, "rating": "7;8;8", "confidence": "4;4;4", "rating_avg": 7.666666666666667, "confidence_avg": 4.0, "replies_avg": 23, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 1505, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14888805482854497974&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "SJ8BZTjeg", "title": "Unsupervised Learning Using Generative Adversarial Training And Clustering", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we propose an unsupervised learning approach that makes use of two components; a deep hierarchical feature extractor, and a more traditional clustering algorithm. We train the feature extractor in a purely unsupervised manner using generative adversarial training and, in the process, study the strengths of learning using a generative model as an adversary. We also show that adversarial training as done in Generative Adversarial Networks (GANs) is not sufficient to automatically group data into categorical clusters. Instead, we use a more traditional grouping algorithm, k-means\t clustering, to cluster the features learned using adversarial training. We experiment on three well-known datasets, CIFAR-10, CIFAR-100 and STL-10. The experiments show that the proposed approach performs similarly to supervised learning approaches, and, might even be better in situations with small amounts of labeled training data and large amounts of unlabeled data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Vittal Premachandran;Alan L. Yuille", "authorids": "vittalp@jhu.edu;ayuille1@jhu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\npremachandran2017unsupervised,\ntitle={Unsupervised Learning Using Generative Adversarial Training And Clustering},\nauthor={Vittal Premachandran and Alan L. 
Yuille},\nyear={2017},\nurl={https://openreview.net/forum?id=SJ8BZTjeg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SJ8BZTjeg", "pdf_size": 0, "rating": "3;3;3", "confidence": "5;4;4", "rating_avg": 3.0, "confidence_avg": 4.333333333333333, "replies_avg": 7, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17941792323493584392&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "SJAr0QFxe", "title": "Demystifying ResNet", "track": "main", "status": "Reject", "tldr": "", "abstract": "We provide a theoretical explanation for the superb performance of ResNet via the study of deep linear networks and some nonlinear variants. We show that with or without nonlinearities, by adding shortcuts that have depth two, the condition number of the Hessian of the loss function at the zero initial point is depth-invariant, which makes training very deep models no more difficult than shallow ones. Shortcuts of higher depth result in an extremely flat (high-order) stationary point initially, from which the optimization algorithm is hard to escape. The 1-shortcut, however, is essentially equivalent to no shortcuts. Extensive experiments are provided accompanying our theoretical results. We show that initializing the network to small weights with 2-shortcuts achieves significantly better results than random Gaussian (Xavier) initialization, orthogonal initialization, and shortcuts of deeper depth, from various perspectives ranging from final loss, learning dynamics and stability, to the behavior of the Hessian along the learning process.", "keywords": "Deep learning;Optimization;Theory", "primary_area": "", "supplementary_material": "", "author": "Sihan Li;Jiantao Jiao;Yanjun Han;Tsachy Weissman", "authorids": "lisihan13@mails.tsinghua.edu.cn;jiantao@stanford.edu;yjhan@stanford.edu;tsachy@stanford.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nli2017demystifying,\ntitle={Demystifying ResNet},\nauthor={Sihan Li and Jiantao Jiao and Yanjun Han and Tsachy Weissman},\nyear={2017},\nurl={https://openreview.net/forum?id=SJAr0QFxe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SJAr0QFxe", "pdf_size": 0, "rating": "4;4;5", "confidence": "5;4;3", "rating_avg": 4.333333333333333, "confidence_avg": 4.0, "replies_avg": 17, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 73, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13989009761452148833&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "SJBr9Mcxl", "title": "Understanding trained CNNs by indexing neuron selectivity", "track": "main", "status": "Reject", "tldr": "", "abstract": "The impressive performance and plasticity of convolutional neural networks to solve different vision problems are shadowed by their black-box nature and its consequent lack of full understanding. To reduce this gap we propose to describe the activity of individual neurons by quantifying their inherent selectivity to specific properties. Our approach is based on the definition of feature selectivity indexes that allow the ranking of neurons according to specific properties. 
Here we report the results of exploring selectivity indexes for: (a) an image feature (color); and (b) an image label (class membership). Our contribution is a framework to seek or classify neurons by indexing on these selectivity properties. It helps to find color selective neurons, such as a red-mushroom neuron in layer conv4 or class selective neurons such as dog-face neurons in layer conv5, and establishes a methodology to derive other selectivity properties. Indexing on neuron selectivity can statistically draw how features and classes are represented through layers at a moment when the size of trained nets is growing and automatic tools to index can be helpful. ", "keywords": "Computer vision;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Ivet Rafegas;Maria Vanrell;Lu\u00eds A. Alexandre", "authorids": "ivet.rafegas@uab.cat;maria.vanrell@uab.cat;lfbaa@ubi.pt", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nrafegas2017understanding,\ntitle={Understanding trained {CNN}s by indexing neuron selectivity},\nauthor={Ivet Rafegas and Maria Vanrell and Lu{\\'\\i}s A. Alexandre},\nyear={2017},\nurl={https://openreview.net/forum?id=SJBr9Mcxl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SJBr9Mcxl", "pdf_size": 0, "rating": "3;7;7", "confidence": "5;4;3", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 15, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844387, "gs_citation": 61, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4539492836056916840&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12 }, { "id": "SJCscQcge", "title": "Simple Black-Box Adversarial Perturbations for Deep Networks", "track": "main", "status": "Reject", "tldr": "Simple, but highly effective, adversarial attacks on deep neural networks even in the absence of any internal knowledge about the network", "abstract": "Deep neural networks are powerful and popular learning models that achieve state-of-the-art pattern recognition performance on many computer vision, speech, and language processing tasks. However, these networks have also been shown susceptible to carefully crafted adversarial perturbations which force misclassification of the inputs. Adversarial examples enable adversaries to subvert the expected system behavior leading to undesired consequences and could pose a security risk when these systems are deployed in the real world.\n\nIn this work, we focus on deep convolutional neural networks and demonstrate that adversaries can easily craft adversarial examples even without any internal knowledge of the target network. Our attacks treat the network as an oracle (black-box) and only assume that the output of the network can be observed on the probed inputs. Our first attack is based on a simple idea of adding perturbation to a randomly selected single pixel or a small set of them. We then improve the effectiveness of this attack by carefully constructing a small set of pixels to perturb by using the idea of greedy local-search. Our proposed attacks also naturally extend to a stronger notion of misclassification. Our extensive experimental results illustrate that even these elementary attacks can reveal a deep neural network's vulnerabilities. 
The simplicity and effectiveness of our proposed schemes mean that they could serve as a litmus test while designing robust networks.\n", "keywords": "Computer vision;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Nina Narodytska;Shiva Kasiviswanathan", "authorids": "n.narodytska@gmail.com;kaivisw@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nnarodytska2017simple,\ntitle={Simple Black-Box Adversarial Perturbations for Deep Networks},\nauthor={Nina Narodytska and Shiva Kasiviswanathan},\nyear={2017},\nurl={https://openreview.net/forum?id=SJCscQcge}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SJCscQcge", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;3;4", "rating_avg": 4.0, "confidence_avg": 3.6666666666666665, "replies_avg": 11, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 291, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10878822896053833036&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 3 }, { "id": "SJDaqqveg", "title": "An Actor-Critic Algorithm for Sequence Prediction", "track": "main", "status": "Poster", "tldr": "Adapting Actor-Critic methods from reinforcement learning to structured prediction", "abstract": "We present an approach to training neural networks to generate sequences using actor-critic methods from reinforcement learning (RL). Current log-likelihood training methods are limited by the discrepancy between their training and testing modes, as models must generate tokens conditioned on their previous guesses rather than the ground-truth tokens. We address this problem by introducing a \\textit{critic} network that is trained to predict the value of an output token, given the policy of an \\textit{actor} network. This results in a training procedure that is much closer to the test phase, and allows us to directly optimize for a task-specific score such as BLEU. Crucially, since we leverage these techniques in the supervised learning setting rather than the traditional RL setting, we condition the critic network on the ground-truth output. We show that our method leads to improved performance on both a synthetic task, and for German-English machine translation. Our analysis paves the way for such methods to be applied in natural language generation tasks, such as machine translation, caption generation, and dialogue modelling. 
", "keywords": "Natural language processing;Deep learning;Reinforcement Learning;Structured prediction", "primary_area": "", "supplementary_material": "", "author": "Dzmitry Bahdanau;Philemon Brakel;Kelvin Xu;Anirudh Goyal;Ryan Lowe;Joelle Pineau;Aaron Courville;Yoshua Bengio", "authorids": "dimabgv@gmail.com;pbpop3@gmail.com;iamkelvinxu@gmail.com;anirudhgoyal9119@gmail.com;lowe.ryan.t@gmail.com;jpineau@cs.mcgill.ca;aaron.courville@gmail.com;yoshua.bengio@gmail.com", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@inproceedings{\nbahdanau2017an,\ntitle={An Actor-Critic Algorithm for Sequence Prediction},\nauthor={Dzmitry Bahdanau and Philemon Brakel and Kelvin Xu and Anirudh Goyal and Ryan Lowe and Joelle Pineau and Aaron Courville and Yoshua Bengio},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=SJDaqqveg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SJDaqqveg", "pdf_size": 0, "rating": "4;8;8", "confidence": "4;4;5", "rating_avg": 6.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 23, "authors#_avg": 8, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 758, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5228204938243984917&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "SJGCiw5gl", "title": "Pruning Convolutional Neural Networks for Resource Efficient Inference", "track": "main", "status": "Poster", "tldr": "New approach for removing unnecessary conv neurons from network. Work is focused on how to estimate importance fast and efficiently by Taylor expansion.", "abstract": "We propose a new formulation for pruning convolutional kernels in neural networks to enable efficient inference. We interleave greedy criteria-based pruning with fine-tuning by backpropagation-a computationally efficient procedure that maintains good generalization in the pruned network. We propose a new criterion based on Taylor expansion that approximates the change in the cost function induced by pruning network parameters. We focus on transfer learning, where large pretrained networks are adapted to specialized tasks. The proposed criterion demonstrates superior performance compared to other criteria, e.g. the norm of kernel weights or feature map activation, for pruning large CNNs after adaptation to fine-grained classification tasks (Birds-200 and Flowers-102) relying only on the first order gradient information. We also show that pruning can lead to more than 10x theoretical reduction in adapted 3D-convolutional filters with a small drop in accuracy in a recurrent gesture classifier. 
Finally, we show results for the large-scale ImageNet dataset to emphasize the flexibility of our approach.", "keywords": "Deep learning;Transfer Learning", "primary_area": "", "supplementary_material": "", "author": "Pavlo Molchanov;Stephen Tyree;Tero Karras;Timo Aila;Jan Kautz", "authorids": "pmolchanov@nvidia.com;styree@nvidia.com;tkarras@nvidia.com;taila@nvidia.com;jkautz@nvidia.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nmolchanov2017pruning,\ntitle={Pruning Convolutional Neural Networks for Resource Efficient Inference},\nauthor={Pavlo Molchanov and Stephen Tyree and Tero Karras and Timo Aila and Jan Kautz},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=SJGCiw5gl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SJGCiw5gl", "pdf_size": 0, "rating": "6;7;9", "confidence": "4;4;4", "rating_avg": 7.333333333333333, "confidence_avg": 4.0, "replies_avg": 20, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 2782, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13741786010220230474&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "SJGPL9Dex", "title": "Understanding Trainable Sparse Coding with Matrix Factorization", "track": "main", "status": "Poster", "tldr": "We analyse the mechanisms which permit to accelerate sparse coding resolution using the problem structure, as it is the case in LISTA.", "abstract": "Sparse coding is a core building block in many data analysis and machine learning pipelines. Typically it is solved by relying on generic optimization techniques, such as the Iterative Soft Thresholding Algorithm and its accelerated version (ISTA, FISTA). These methods are optimal in the class of first-order methods for non-smooth, convex functions. However, they do not exploit the particular structure of the problem at hand nor the input data distribution. An acceleration using neural networks, coined LISTA, was proposed in \\cite{Gregor10}, which showed empirically that one could achieve high quality estimates with few iterations by modifying the parameters of the proximal splitting appropriately.\n\nIn this paper we study the reasons for such acceleration. Our mathematical analysis reveals that it is related to a specific matrix factorization of the Gram kernel of the dictionary, which attempts to nearly diagonalise the kernel with a basis that produces a small perturbation of the $\\ell_1$ ball. When this factorization succeeds, we prove that the resulting splitting algorithm enjoys an improved convergence bound with respect to the non-adaptive version. Moreover, our analysis also shows that conditions for acceleration occur mostly at the beginning of the iterative process, consistent with numerical experiments. 
We further validate our analysis by showing that on dictionaries where this factorization does not exist, adaptive acceleration fails.", "keywords": "Theory;Deep learning;Optimization", "primary_area": "", "supplementary_material": "", "author": "Thomas Moreau;Joan Bruna", "authorids": "thomas.moreau@cmla.ens-cachan.fr;joan.bruna@berkeley.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nmoreau2017understanding,\ntitle={Understanding Trainable Sparse Coding with Matrix Factorization},\nauthor={Thomas Moreau and Joan Bruna},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=SJGPL9Dex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SJGPL9Dex", "pdf_size": 0, "rating": "5;6;8", "confidence": "2;3;4", "rating_avg": 6.333333333333333, "confidence_avg": 3.0, "replies_avg": 9, "authors#_avg": 2, "corr_rating_confidence": 0.9819805060619659, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=880859163895253968&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "SJIMPr9eg", "title": "Boosted Residual Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper we present a new ensemble method, called Boosted Residual Networks,\nwhich builds an ensemble of Residual Networks by growing the member\nnetwork at each round of boosting. The proposed approach combines recent developments\nin Residual Networks - a method for creating very deep networks by\nincluding a shortcut layer between different groups of layers - with the Deep Incremental\nBoosting, which has been proposed as a methodology to train fast ensembles\nof networks of increasing depth through the use of boosting. We demonstrate\nthat the synergy of Residual Networks and Deep Incremental Boosting has better\npotential than simply boosting a Residual Network of fixed structure or using the\nequivalent Deep Incremental Boosting without the shortcut layers.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alan Mosca;George D. Magoulas", "authorids": "a.mosca@dcs.bbk.ac.uk;gmagoulas@dcs.bbk.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nmosca2017boosted,\ntitle={Boosted Residual Networks},\nauthor={Alan Mosca and George D. 
Magoulas},\nyear={2017},\nurl={https://openreview.net/forum?id=SJIMPr9eg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SJIMPr9eg", "pdf_size": 0, "rating": "3;3;4", "confidence": "5;5;5", "rating_avg": 3.3333333333333335, "confidence_avg": 5.0, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4136932075080351706&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "SJJKxrsgl", "title": "Emergence of foveal image sampling from learning to attend in visual scenes", "track": "main", "status": "Poster", "tldr": "We show a foveal sampling lattice similar to those observed in biology emerges from our model and task.", "abstract": "We describe a neural attention model with a learnable retinal sampling lattice. The model is trained on a visual search task requiring the classification of an object embedded in a visual scene amidst background distractors using the smallest number of fixations. We explore the tiling properties that emerge in the model's retinal sampling lattice after training. Specifically, we show that this lattice resembles the eccentricity dependent sampling lattice of the primate retina, with a high resolution region in the fovea surrounded by a low resolution periphery. Furthermore, we find conditions where these emergent properties are amplified or eliminated providing clues to their function.", "keywords": "Computer vision;Deep learning;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Brian Cheung;Eric Weiss;Bruno Olshausen", "authorids": "bcheung@berkeley.edu;eaweiss@berkeley.edu;baolshausen@berkeley.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ncheung2017emergence,\ntitle={Emergence of foveal image sampling from learning to attend in visual scenes},\nauthor={Brian Cheung and Eric Weiss and Bruno Olshausen},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=SJJKxrsgl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SJJKxrsgl", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;4;5", "rating_avg": 5.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 14, "authors#_avg": 3, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 52, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5247191877093024920&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "SJJN38cge", "title": "Distributed Transfer Learning for Deep Convolutional Neural Networks by Basic Probability Assignment", "track": "main", "status": "Reject", "tldr": "", "abstract": "Transfer learning is a popular practice in deep neural networks, but fine-tuning of a large number of parameters is a hard challenge due to the complex wiring of neurons between splitting layers and imbalance class distributions of original and transferred domains. Recent advances in evidence theory show that in an imbalance multiclass learning problem, optimizing of proper objective functions based on contingency tables prevents biases towards high-prior classes. 
Transfer learning usually deals with highly non-convex objectives and local minima in deep neural architectures. We propose a novel distributed transfer learning to tackle both optimization complexity and class-imbalance problem jointly. Our solution imposes separated greedy regularization to each individual convolutional filter to make single-filter neural networks such that the minority classes perform as the majority ones. Then, basic probability assignment from evidence theory boosts these distributed networks to improve the recognition performance on the target domains. Our experiments on several standard datasets confirm the consistent improvement as a result of our distributed transfer learning strategy.", "keywords": "Deep learning;Transfer Learning;Supervised Learning;Optimization", "primary_area": "", "supplementary_material": "", "author": "Arash Shahriari", "authorids": "arash.shahriari@csiro.au", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nshahriari2017distributed,\ntitle={Distributed Transfer Learning for Deep Convolutional Neural Networks by Basic Probability Assignment},\nauthor={Arash Shahriari},\nyear={2017},\nurl={https://openreview.net/forum?id=SJJN38cge}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SJJN38cge", "pdf_size": 0, "rating": "3;3;4", "confidence": "3;4;4", "rating_avg": 3.3333333333333335, "confidence_avg": 3.6666666666666665, "replies_avg": 11, "authors#_avg": 1, "corr_rating_confidence": 0.49999999999999983, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:apf5rR49GKUJ:scholar.google.com/&scioq=Distributed+Transfer+Learning+for+Deep+Convolutional+Neural+Networks+by+Basic+Probability+Assignment&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "SJMGPrcle", "title": "Learning to Navigate in Complex Environments", "track": "main", "status": "Poster", "tldr": "We proposed a deep RL method, augmented with memory and auxiliary learning targets, for training agents to navigate within large and visually rich environments that include frequently changing start and goal locations", "abstract": "Learning to navigate in complex environments with dynamic elements is an important milestone in developing AI agents. In this work we formulate the navigation question as a reinforcement learning problem and show that data efficiency and task performance can be dramatically improved by relying on additional auxiliary tasks to bootstrap learning. In particular we consider jointly learning the goal-driven reinforcement learning problem with an unsupervised depth prediction task and a self-supervised loop closure classification task. Using this approach we can learn to navigate from raw sensory input in complicated 3D mazes, approaching human-level performance even under conditions where the goal location changes frequently. 
We provide detailed analysis of the agent behaviour, its ability to localise, and its network activity dynamics, that show that the agent implicitly learns key navigation abilities, with only sparse rewards and without direct supervision.", "keywords": "Deep learning;Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Piotr Mirowski;Razvan Pascanu;Fabio Viola;Hubert Soyer;Andy Ballard;Andrea Banino;Misha Denil;Ross Goroshin;Laurent Sifre;Koray Kavukcuoglu;Dharshan Kumaran;Raia Hadsell", "authorids": "piotrmirowski@google.com;razp@google.com;fviola@google.com;soyer@google.com;aybd@google.com;abanino@google.com;mdenil@google.com;goroshin@google.com;sifre@google.com;korayk@google.com;dkumaran@google.com;raia@google.com", "gender": ";;;;;;;;;;;", "homepage": ";;;;;;;;;;;", "dblp": ";;;;;;;;;;;", "google_scholar": ";;;;;;;;;;;", "orcid": ";;;;;;;;;;;", "linkedin": ";;;;;;;;;;;", "or_profile": ";;;;;;;;;;;", "aff": ";;;;;;;;;;;", "aff_domain": ";;;;;;;;;;;", "position": ";;;;;;;;;;;", "bibtex": "@inproceedings{\nmirowski2017learning,\ntitle={Learning to Navigate in Complex Environments},\nauthor={Piotr Mirowski and Razvan Pascanu and Fabio Viola and Hubert Soyer and Andy Ballard and Andrea Banino and Misha Denil and Ross Goroshin and Laurent Sifre and Koray Kavukcuoglu and Dharshan Kumaran and Raia Hadsell},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=SJMGPrcle}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=SJMGPrcle", "pdf_size": 0, "rating": "5;7;7", "confidence": "4;5;3", "rating_avg": 6.333333333333333, "confidence_avg": 4.0, "replies_avg": 22, "authors#_avg": 12, "corr_rating_confidence": 0.0, "gs_citation": 1075, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17642659027854201917&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "SJNDWNOlg", "title": "What Is the Best Practice for CNNs Applied to Visual Instance Retrieval?", "track": "main", "status": "Reject", "tldr": "", "abstract": "Previous work has shown that feature maps of deep convolutional neural networks (CNNs)\ncan be interpreted as feature representation of a particular image region. Features aggregated from\nthese feature maps have been exploited for image retrieval tasks and achieved state-of-the-art performances in\nrecent years. The key to the success of such methods is the feature representation. However, the different\nfactors that impact the effectiveness of features are still not explored thoroughly. There are much less\ndiscussion about the best combination of them.\n\nThe main contribution of our paper is the thorough evaluations of the various factors that affect the\ndiscriminative ability of the features extracted from CNNs. Based on the evaluation results, we also identify \nthe best choices for different factors and propose a new multi-scale image feature representation method to \nencode the image effectively. 
Finally, we show that the proposed method generalises well and outperforms \nthe state-of-the-art methods on four typical datasets used for visual instance retrieval.", "keywords": "Computer vision;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Jiedong Hao;Jing Dong;Wei Wang;Tieniu Tan", "authorids": "jiedong.hao@cripac.ia.ac.cn;jdong@nlpr.ia.ac.cn;wwang@nlpr.ia.ac.cn;tnt@nlpr.ia.ac.cn", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nhao2017what,\ntitle={What Is the Best Practice for {CNN}s Applied to Visual Instance Retrieval?},\nauthor={Jiedong Hao and Jing Dong and Wei Wang and Tieniu Tan},\nyear={2017},\nurl={https://openreview.net/forum?id=SJNDWNOlg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SJNDWNOlg", "pdf_size": 0, "rating": "3;3;6", "confidence": "5;5;4", "rating_avg": 4.0, "confidence_avg": 4.666666666666667, "replies_avg": 12, "authors#_avg": 4, "corr_rating_confidence": -1.0, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16299847323548311322&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "SJQNqLFgl", "title": "Deep Convolutional Neural Network Design Patterns", "track": "main", "status": "Reject", "tldr": "We take a high-level view of the network architectures as the basis for discovering universal principles of the design of convolutional neural network architecture.. ", "abstract": "Recent research in the deep learning field has produced a plethora of new architectures. At the same time, a growing number of groups are applying deep learning to new applications. Some of these groups are likely to be composed of inexperienced deep learning practitioners who are baffled by the dizzying array of architecture choices and therefore opt to use an older architecture (i.e., Alexnet). Here we attempt to bridge this gap by mining the collective knowledge contained in recent deep learning research to discover underlying principles for designing neural network architectures. In addition, we describe several architectural innovations, including Fractal of FractalNet network, Stagewise Boosting Networks, and Taylor Series Networks (our Caffe code and prototxt files are available at https://github.com/iPhysicist/CNNDesignPatterns). We hope others are inspired to build on our preliminary work.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Leslie N. Smith;Nicholay Topin", "authorids": "leslie.smith@nrl.navy.mil;ntopin@umbc.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nsmith2017deep,\ntitle={Deep Convolutional Neural Network Design Patterns},\nauthor={Leslie N. 
Smith and Nicholay Topin},\nyear={2017},\nurl={https://openreview.net/forum?id=SJQNqLFgl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SJQNqLFgl", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;4;3", "rating_avg": 3.3333333333333335, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 2, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 83, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13221893251685351825&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "SJRpRfKxx", "title": "Recurrent Mixture Density Network for Spatiotemporal Visual Attention", "track": "main", "status": "Poster", "tldr": "", "abstract": "In many computer vision tasks, the relevant information to solve the problem at hand is mixed to irrelevant, distracting information. This has motivated researchers to design attentional models that can dynamically focus on parts of images or videos that are salient, e.g., by down-weighting irrelevant pixels. In this work, we propose a spatiotemporal attentional model that learns where to look in a video directly from human fixation data. We model visual attention with a mixture of Gaussians at each frame. This distribution is used to express the probability of saliency for each pixel. Time consistency in videos is modeled hierarchically by: 1) deep 3D convolutional features to represent spatial and short-term time relations and 2) a long short-term memory network on top that aggregates the clip-level representation of sequential clips and therefore expands the temporal domain from few frames to seconds. The parameters of the proposed model are optimized via maximum likelihood estimation using human fixations as training data, without knowledge of the action in each video. Our experiments on Hollywood2 show state-of-the-art performance on saliency prediction for video. 
We also show that our attentional model trained on Hollywood2 generalizes well to UCF101 and it can be leveraged to improve action classification accuracy on both datasets.", "keywords": "Computer vision;Deep learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Loris Bazzani;Hugo Larochelle;Lorenzo Torresani", "authorids": "loris.bazzani@gmail.com;hugo.larochelle@usherbrooke.ca;lt@dartmouth.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nbazzani2017recurrent,\ntitle={Recurrent Mixture Density Network for Spatiotemporal Visual Attention},\nauthor={Loris Bazzani and Hugo Larochelle and Lorenzo Torresani},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=SJRpRfKxx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SJRpRfKxx", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;4", "rating_avg": 6.333333333333333, "confidence_avg": 4.0, "replies_avg": 20, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 168, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2141363771438095104&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "SJTQLdqlg", "title": "Learning to Remember Rare Events", "track": "main", "status": "Poster", "tldr": "We introduce a memory module for life-long learning that adds one-shot learning capability to any supervised neural network.", "abstract": "Despite recent advances, memory-augmented deep neural networks are still limited\nwhen it comes to life-long and one-shot learning, especially in remembering rare events.\nWe present a large-scale life-long memory module for use in deep learning.\nThe module exploits fast nearest-neighbor algorithms for efficiency and\nthus scales to large memory sizes.\nExcept for the nearest-neighbor query, the module is fully differentiable\nand trained end-to-end with no extra supervision. 
It operates in\na life-long manner, i.e., without the need to reset it during training.\n\nOur memory module can be easily added to any part of a supervised neural network.\nTo show its versatility we add it to a number of networks, from simple\nconvolutional ones tested on image classification to deep sequence-to-sequence\nand recurrent-convolutional models.\nIn all cases, the enhanced network gains the ability to remember\nand do life-long one-shot learning.\nOur module remembers training examples shown many thousands\nof steps in the past and it can successfully generalize from them.\nWe set new state-of-the-art for one-shot learning on the Omniglot dataset\nand demonstrate, for the first time, life-long one-shot learning in\nrecurrent neural networks on a large-scale machine translation task.", "keywords": "Deep learning", "primary_area": "", "supplementary_material": "", "author": "Lukasz Kaiser;Ofir Nachum;Aurko Roy;Samy Bengio", "authorids": "lukaszkaiser@google.com;ofirnachum@google.com;aurko@gatech.edu;bengio@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nkaiser2017learning,\ntitle={Learning to Remember Rare Events},\nauthor={Lukasz Kaiser and Ofir Nachum and Aurko Roy and Samy Bengio},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=SJTQLdqlg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SJTQLdqlg", "pdf_size": 0, "rating": "6;7;8", "confidence": "5;4;3", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 14, "authors#_avg": 4, "corr_rating_confidence": -1.0, "gs_citation": 434, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=704947026913270950&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "SJU4ayYgl", "title": "Semi-Supervised Classification with Graph Convolutional Networks", "track": "main", "status": "Poster", "tldr": "Semi-supervised classification with a CNN model for graphs. State-of-the-art results on a number of citation network datasets.", "abstract": "We present a scalable approach for semi-supervised learning on graph-structured data that is based on an efficient variant of convolutional neural networks which operate directly on graphs. We motivate the choice of our convolutional architecture via a localized first-order approximation of spectral graph convolutions. Our model scales linearly in the number of graph edges and learns hidden layer representations that encode both local graph structure and features of nodes. In a number of experiments on citation networks and on a knowledge graph dataset we demonstrate that our approach outperforms related methods by a significant margin.", "keywords": "Deep learning;Semi-Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Thomas N. Kipf;Max Welling", "authorids": "T.N.Kipf@uva.nl;M.Welling@uva.nl", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nkipf2017semisupervised,\ntitle={Semi-Supervised Classification with Graph Convolutional Networks},\nauthor={Thomas N. 
Kipf and Max Welling},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=SJU4ayYgl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SJU4ayYgl", "pdf_size": 0, "rating": "7;7;7", "confidence": "3;4;4", "rating_avg": 7.0, "confidence_avg": 3.6666666666666665, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 45220, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9692529718922546949&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 23 }, { "id": "SJUdkecgx", "title": "", "track": "main", "status": "Reject", "tldr": "We propose: ", "abstract": " ", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "", "authorids": "", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\n2017,\ntitle={},\nauthor={},\nyear={2017},\nurl={https://openreview.net/forum?id=SJUdkecgx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SJUdkecgx", "pdf_size": 0, "rating": "4;7;7", "confidence": "4;3;3", "rating_avg": 6.0, "confidence_avg": 3.3333333333333335, "replies_avg": 9, "authors#_avg": 1, "corr_rating_confidence": -1.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "SJZAb5cel", "title": "A Joint Many-Task Model: Growing a Neural Network for Multiple NLP Tasks", "track": "main", "status": "Reject", "tldr": "A single deep multi-task learning model for five different NLP tasks.", "abstract": "Transfer and multi-task learning have traditionally focused on either a single source-target pair or very few, similar tasks. Ideally, the linguistic levels of morphology, syntax and semantics would benefit each other by being trained in a single model. We introduce such a joint many-task model together with a strategy for successively growing its depth to solve increasingly complex tasks. All layers include shortcut connections to both word representations and lower-level task predictions. We use a simple regularization term to allow for optimizing all model weights to improve one task's loss without exhibiting catastrophic interference of the other tasks. Our single end-to-end trainable model obtains state-of-the-art results on chunking, dependency parsing, semantic relatedness and textual entailment. It also performs competitively on POS tagging. 
Our dependency parsing layer relies only on a single feed-forward pass and does not require a beam search.", "keywords": "Natural language processing;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Kazuma Hashimoto;Caiming Xiong;Yoshimasa Tsuruoka;Richard Socher", "authorids": "hassy@logos.t.u-tokyo.ac.jp;cxiong@salesforce.com;tsuruoka@logos.t.u-tokyo.ac.jp;rsocher@salesforce.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nhashimoto2017a,\ntitle={A Joint Many-Task Model: Growing a Neural Network for Multiple {NLP} Tasks},\nauthor={Kazuma Hashimoto and Caiming Xiong and Yoshimasa Tsuruoka and Richard Socher},\nyear={2017},\nurl={https://openreview.net/forum?id=SJZAb5cel}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SJZAb5cel", "pdf_size": 0, "rating": "3;5;6", "confidence": "4;4;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 13, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 684, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3437546579340829684&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "SJ_QCYqle", "title": "Semi-Supervised Detection of Extreme Weather Events in Large Climate Datasets", "track": "main", "status": "Reject", "tldr": "Semi-supervised 3D CNN's improve bounding box detection of weather events in climate simulations compared to supervised approaches.", "abstract": "The detection and identification of extreme weather events in large scale climate simulations is an important problem for risk management, informing governmental policy decisions and advancing our basic understanding of the climate system.\nRecent work has shown that fully supervised convolutional neural networks (CNNs) can yield acceptable accuracy for classifying well-known types of extreme weather events when large amounts of labeled data are available. However, there are many different types of spatially localized climate patterns of interest (including hurricanes, extra-tropical cyclones, weather fronts, blocking events, etc.)\nfound in simulation data for which labeled data is not available at large scale for all simulations of interest.\nWe present a multichannel spatiotemporal encoder-decoder CNN architecture for semi-supervised bounding box prediction and exploratory data analysis.\nThis architecture is designed to fully model multi-channel simulation data, temporal dynamics and unlabelled data within a reconstruction and prediction framework so as to improve the detection of a wide range of extreme weather events. \nOur architecture can be viewed as a 3D convolutional autoencoder with an additional modified one-pass bounding box regression loss. \nWe demonstrate that our approach is able to leverage temporal information and unlabelled data to improve localization of extreme weather events. 
Further, we explore the representations learned by our model in order to better understand this important data, and facilitate further work in understanding and mitigating the effects of climate change.", "keywords": "Semi-Supervised Learning;Applications;Computer vision", "primary_area": "", "supplementary_material": "", "author": "Evan Racah;Christopher Beckham;Tegan Maharaj;Prabhat;Christopher Pal", "authorids": "eracah@lbl.gov;christopher.beckham@polymtl.ca;tegan.maharaj@polymtl.ca;prabhat@lbl.gov;christopher.pal@polymtl.ca", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nracah2017semisupervised,\ntitle={Semi-Supervised Detection of Extreme Weather Events in Large Climate Datasets},\nauthor={Evan Racah and Christopher Beckham and Tegan Maharaj and Prabhat and Christopher Pal},\nyear={2017},\nurl={https://openreview.net/forum?id=SJ_QCYqle}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SJ_QCYqle", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;4;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 11, "authors#_avg": 5, "corr_rating_confidence": -0.49999999999999983, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18159740708525045726&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "SJc1hL5ee", "title": "FastText.zip: Compressing text classification models", "track": "main", "status": "Reject", "tldr": "Compressing text classification models", "abstract": "We consider the problem of producing compact architectures for text classification, such that the full model fits in a limited amount of memory. After considering different solutions inspired by the hashing literature, we propose a method built upon product quantization to store the word embeddings. While the original technique leads to a loss in accuracy, we adapt this method to circumvent the quantization artifacts. As a result, our approach produces a text classifier, derived from the fastText approach, which at test time requires only a fraction of the memory compared to the original one, without noticeably sacrificing the quality in terms of classification accuracy. Our experiments carried out on several benchmarks show that our approach typically requires two orders of magnitude less memory than fastText while being only slightly inferior with respect to accuracy. 
As a result, it outperforms the state of the art by a good margin in terms of the compromise between memory usage and accuracy.", "keywords": "Natural language processing;Supervised Learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Armand Joulin;Edouard Grave;Piotr Bojanowski;Matthijs Douze;Herve Jegou;Tomas Mikolov", "authorids": "ajoulin@fb.com;egrave@fb.com;bojanowski@fb.com;matthijs@fb.com;rvj@fb.com;tmikolov@fb.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\njoulin2017fasttextzip,\ntitle={FastText.zip: Compressing text classification models},\nauthor={Armand Joulin and Edouard Grave and Piotr Bojanowski and Matthijs Douze and Herve Jegou and Tomas Mikolov},\nyear={2017},\nurl={https://openreview.net/forum?id=SJc1hL5ee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SJc1hL5ee", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;4;3", "rating_avg": 5.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 10, "authors#_avg": 6, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 1890, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11568944492984166845&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "SJg498clg", "title": "Neural Graph Machines: Learning Neural Networks Using Graphs", "track": "main", "status": "Reject", "tldr": "", "abstract": "Label propagation is a powerful and flexible semi-supervised learning technique on graphs. Neural network architectures, on the other hand, have proven track records in many supervised learning tasks. In this work, we propose a training objective for neural networks, Neural Graph Machines, for combining the power of neural networks and label propagation. The new objective allows the neural networks to harness both labeled and unlabeled data by: (a) allowing the network to train using labeled data as in the supervised setting, (b) biasing the network to learn similar hidden representations for neighboring nodes on a graph, in the same vein as label propagation. Such architectures with the proposed objective can be trained efficiently using stochastic gradient descent and scaled to large graphs. The proposed method is experimentally validated on a wide range of tasks (multi- label classification on social graphs, news categorization and semantic intent classification) using different architectures (NNs, CNNs, and LSTM RNNs).", "keywords": "Semi-Supervised Learning;Natural language processing;Applications", "primary_area": "", "supplementary_material": "", "author": "Thang D. Bui;Sujith Ravi;Vivek Ramavajjala", "authorids": "tdb40@cam.ac.uk;sravi@google.com;vramavaj@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nbui2017neural,\ntitle={Neural Graph Machines: Learning Neural Networks Using Graphs},\nauthor={Thang D. 
Bui and Sujith Ravi and Vivek Ramavajjala},\nyear={2017},\nurl={https://openreview.net/forum?id=SJg498clg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SJg498clg", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;4;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.0, "replies_avg": 11, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12514164089612964498&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "SJgWQPcxl", "title": "Multi-view Generative Adversarial Networks", "track": "main", "status": "Reject", "tldr": "We describe the MV-BiGAN model able to perform density estimation from multiple views, and to update its prediction when additional views are provided", "abstract": "Learning over multi-view data is a challenging problem with strong practical applications. Most related studies focus on the classification point of view and assume that all the views are available at any time. We consider an extension of this framework in two directions. First, based on the BiGAN model, the Multi-view BiGAN (MV-BiGAN) is able to perform density estimation from multi-view inputs. Second, it can deal with missing views and is able to update its prediction when additional views are provided. We illustrate these properties on a set of experiments over different datasets.", "keywords": "Deep learning;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Micka\u00ebl Chen;Ludovic Denoyer", "authorids": "mickael.chen@lip6.fr;ludovic.denoyer@lip6.fr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nchen2017multiview,\ntitle={Multi-view Generative Adversarial Networks},\nauthor={Micka{\\\"e}l Chen and Ludovic Denoyer},\nyear={2017},\nurl={https://openreview.net/forum?id=SJgWQPcxl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SJgWQPcxl", "pdf_size": 0, "rating": "3;5;6", "confidence": "3;3;4", "rating_avg": 4.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": 0.7559289460184545, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1189692556669212001&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 8 }, { "id": "SJiFvr9el", "title": "Linear Time Complexity Deep Fourier Scattering Network and Extension to Nonlinear Invariants", "track": "main", "status": "Reject", "tldr": "This paper proposes an extension of the Scattering Network in the Fourier domain and with nonlinear invariant computation for fast and scalable unsupervised representations", "abstract": "In this paper we propose a scalable version of a state-of-the-art deterministic time-\ninvariant feature extraction approach based on consecutive changes of basis and\nnonlinearities, namely, the scattering network. The first focus of the paper is to\nextend the scattering network to allow the use of higher order nonlinearities as\nwell as extracting nonlinear and Fourier based statistics leading to the required in-\nvariants of any inherently structured input. In order to reach fast convolutions and\nto leverage the intrinsic structure of wavelets, we derive our complete model in the\nFourier domain. 
In addition of providing fast computations, we are now able to\nexploit sparse matrices due to extremely high sparsity well localized in the Fourier\ndomain. As a result, we are able to reach a true linear time complexity with in-\nputs in the Fourier domain allowing fast and energy efficient solutions to machine\nlearning tasks. Validation of the features and computational results will be pre-\nsented through the use of these invariant coefficients to perform classification on\naudio recordings of bird songs captured in multiple different soundscapes. In the\nend, the applicability of the presented solutions to deep artificial neural networks\nis discussed.", "keywords": "Unsupervised Learning;Applications;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Randall Balestriero;Herve Glotin", "authorids": "randallbalestriero@gmail.com;glotin@univ-tln.fr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nbalestriero2017linear,\ntitle={Linear Time Complexity Deep Fourier Scattering Network and Extension to Nonlinear Invariants},\nauthor={Randall Balestriero and Herve Glotin},\nyear={2017},\nurl={https://openreview.net/forum?id=SJiFvr9el}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SJiFvr9el", "pdf_size": 0, "rating": "4;4;5", "confidence": "3;5;3", "rating_avg": 4.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 11, "authors#_avg": 2, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=41143781370838719&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "SJk01vogl", "title": "Adversarial examples for generative models", "track": "main", "status": "Reject", "tldr": "Exploration of ways to attack generative models with adversarial examples and why someone might want to do that.", "abstract": "We explore methods of producing adversarial examples on deep generative models such as the variational autoencoder (VAE) and the VAE-GAN. Deep learning architectures are known to be vulnerable to adversarial examples, but previous work has focused on the application of adversarial examples to classification tasks. Deep generative models have recently become popular due to their ability to model input data distributions and generate realistic examples from those distributions. We present three classes of attacks on the VAE and VAE-GAN architectures and demonstrate them against networks trained on MNIST, SVHN and CelebA. Our first attack leverages classification-based adversaries by attaching a classifier to the trained encoder of the target generative model, which can then be used to indirectly manipulate the latent representation. Our second attack directly uses the VAE loss function to generate a target reconstruction image from the adversarial example. Our third attack moves beyond relying on classification or the standard loss for the gradient and directly optimizes against differences in source and target latent representations. 
We also motivate why an attacker might be interested in deploying such techniques against a target generative network.", "keywords": "Computer vision;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Jernej Kos;Ian Fischer;Dawn Song", "authorids": "jernej@kos.mx;iansf@google.com;dawnsong.travel@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nkos2017adversarial,\ntitle={Adversarial examples for generative models},\nauthor={Jernej Kos and Ian Fischer and Dawn Song},\nyear={2017},\nurl={https://openreview.net/forum?id=SJk01vogl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=SJk01vogl", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 13, "authors#_avg": 3, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 323, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17431509835415516866&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "SJkXfE5xx", "title": "Revisiting Classifier Two-Sample Tests", "track": "main", "status": "Poster", "tldr": "Modern binary classifiers can be easily turned into powerful two-sample tests, and used to evaluate generative models.", "abstract": "The goal of two-sample tests is to assess whether two samples, $S_P \\sim P^n$ and $S_Q \\sim Q^m$, are drawn from the same distribution. Perhaps intriguingly, one relatively unexplored method to build two-sample tests is the use of binary classifiers. In particular, construct a dataset by pairing the $n$ examples in $S_P$ with a positive label, and by pairing the $m$ examples in $S_Q$ with a negative label. If the null hypothesis ``$P = Q$'' is true, then the classification accuracy of a binary classifier on a held-out subset of this dataset should remain near chance-level. As we will show, such \\emph{Classifier Two-Sample Tests} (C2ST) learn a suitable representation of the data on the fly, return test statistics in interpretable units, have a simple null distribution, and their predictive uncertainty allow to interpret where $P$ and $Q$ differ.\n\nThe goal of this paper is to establish the properties, performance, and uses of C2ST. First, we analyze their main theoretical properties. Second, we compare their performance against a variety of state-of-the-art alternatives. Third, we propose their use to evaluate the sample quality of generative models with intractable likelihoods, such as Generative Adversarial Networks (GANs). 
Fourth, we showcase the novel application of GANs together with C2ST for causal discovery.", "keywords": "Theory;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "David Lopez-Paz;Maxime Oquab", "authorids": "dlp@fb.com;qas@fb.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nlopez-paz2017revisiting,\ntitle={Revisiting Classifier Two-Sample Tests},\nauthor={David Lopez-Paz and Maxime Oquab},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=SJkXfE5xx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SJkXfE5xx", "pdf_size": 0, "rating": "7;7;8", "confidence": "3;4;5", "rating_avg": 7.333333333333333, "confidence_avg": 4.0, "replies_avg": 21, "authors#_avg": 2, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 537, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12680900003954123128&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 6 }, { "id": "SJqaCVLxx", "title": "New Learning Approach By Genetic Algorithm In A Convolutional Neural Network For Pattern Recognition", "track": "main", "status": "Reject", "tldr": "Implement new approach without exerting backpropagation in learning of CNN is useful for parallel processing Like GPU.", "abstract": "Almost all of the presented articles in the CNN are based on the error backpropagation algorithm and calculation of derivations of error, our innovative proposal refers to engaging TICA filters and NSGA-II genetic algorithms to train the LeNet-5 CNN network. Consequently, genetic algorithm updates the weights of LeNet-5 CNN network similar to chromosome update. In our approach the weights of LeNet-5 are obtained in two stages. The first is pre-training and the second is fine-tuning. 
As a result, our approach impacts in learning task.", "keywords": "Deep learning;Supervised Learning;Optimization;Computer vision", "primary_area": "", "supplementary_material": "", "author": "Mohammad Ali Mehrolhassani;Majid Mohammadi", "authorids": "Alimehrolhassani@yahoo.com;Mohammadi@uk.ac.ir", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nmehrolhassani2017new,\ntitle={New Learning Approach By Genetic Algorithm In A Convolutional Neural Network For Pattern Recognition},\nauthor={Mohammad Ali Mehrolhassani and Majid Mohammadi},\nyear={2017},\nurl={https://openreview.net/forum?id=SJqaCVLxx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SJqaCVLxx", "pdf_size": 0, "rating": "2;3;3", "confidence": "5;5;5", "rating_avg": 2.6666666666666665, "confidence_avg": 5.0, "replies_avg": 21, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:6UqYdWhQQq4J:scholar.google.com/&scioq=New+Learning+Approach+By+Genetic+Algorithm+In+A+Convolutional+Neural+Network+For+Pattern+Recognition&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "SJttqw5ge", "title": "Communicating Hierarchical Neural Controllers for Learning Zero-shot Task Generalization", "track": "main", "status": "Reject", "tldr": "", "abstract": "The ability to generalize from past experience to solve previously unseen tasks is a key research challenge in reinforcement learning (RL). In this paper, we consider RL tasks defined as a sequence of high-level instructions described by natural language and study two types of generalization: to unseen and longer sequences of previously seen instructions, and to sequences where the instructions themselves were previously not seen. \nWe present a novel hierarchical deep RL architecture that consists of two interacting neural controllers: a meta controller that reads instructions and repeatedly communicates subtasks to a subtask controller that in turn learns to perform such subtasks. To generalize better to unseen instructions, we propose a regularizer that encourages to learn subtask embeddings that capture correspondences between similar subtasks. We also propose a new differentiable neural network architecture in the meta controller that learns temporal abstractions which makes learning more stable under delayed reward. Our architecture is evaluated on a stochastic 2D grid world and a 3D visual environment where the agent should execute a list of instructions. 
We demonstrate that the proposed architecture is able to generalize well over unseen instructions as well as longer lists of instructions.", "keywords": "Reinforcement Learning;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Junhyuk Oh;Satinder Singh;Honglak Lee;Pushmeet Kohli", "authorids": "junhyuk@umich.edu;baveja@umich.edu;honglak@umich.edu;pkohli@microsoft.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\noh2017communicating,\ntitle={Communicating Hierarchical Neural Controllers for Learning Zero-shot Task Generalization},\nauthor={Junhyuk Oh and Satinder Singh and Honglak Lee and Pushmeet Kohli},\nyear={2017},\nurl={https://openreview.net/forum?id=SJttqw5ge}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=SJttqw5ge", "pdf_size": 0, "rating": "3;4;5;7", "confidence": "4;3;5;0", "rating_avg": 4.75, "confidence_avg": 3.0, "replies_avg": 15, "authors#_avg": 4, "corr_rating_confidence": -0.7228063223242011, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:5nEo4Sbe1XkJ:scholar.google.com/&scioq=Communicating+Hierarchical+Neural+Controllers+for+Learning+Zero-shot+Task+Generalization&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "SJvYgH9xe", "title": "Automatic Rule Extraction from Long Short Term Memory Networks", "track": "main", "status": "Poster", "tldr": "We introduce a word importance score for LSTMs, and show that we can use it to replicate an LSTM's performance using a simple, rules-based classifier.", "abstract": "Although deep learning models have proven effective at solving problems in natural language processing, the mechanism by which they come to their conclusions is often unclear. As a result, these models are generally treated as black boxes, yielding no insight of the underlying learned patterns. In this paper we consider Long Short Term Memory networks (LSTMs) and demonstrate a new approach for tracking the importance of a given input to the LSTM for a given output. By identifying consistently important patterns of words, we are able to distill state of the art LSTMs on sentiment analysis and question answering into a set of representative phrases. This representation is then quantitatively validated by using the extracted phrases to construct a simple, rule-based classifier which approximates the output of the LSTM.", "keywords": "Natural language processing;Deep learning;Applications", "primary_area": "", "supplementary_material": "", "author": "W. James Murdoch;Arthur Szlam", "authorids": "jmurdoch@berkeley.edu;aszlam@fb.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nmurdoch2017automatic,\ntitle={Automatic Rule Extraction from Long Short Term Memory Networks},\nauthor={W. 
James Murdoch and Arthur Szlam},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=SJvYgH9xe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SJvYgH9xe", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;3;3", "rating_avg": 7.0, "confidence_avg": 3.3333333333333335, "replies_avg": 15, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 131, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12848514333189090104&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "SJx7Jrtgl", "title": "Deep Unsupervised Clustering with Gaussian Mixture Variational Autoencoders", "track": "main", "status": "Reject", "tldr": "We study a variant of the variational autoencoder model with a Gaussian mixture as prior distribution and discuss its optimization difficulties and capabilities for unsupervised clustering.", "abstract": "We study a variant of the variational autoencoder model (VAE) with a Gaussian mixture as a prior distribution, with the goal of performing unsupervised clustering through deep generative models. We observe that the known problem of over-regularisation that has been shown to arise in regular VAEs also manifests itself in our model and leads to cluster degeneracy. We show that a heuristic called minimum information constraint that has been shown to mitigate this effect in VAEs can also be applied to improve unsupervised clustering performance with our model. Furthermore we analyse the effect of this heuristic and provide an intuition of the various processes with the help of visualizations. Finally, we demonstrate the performance of our model on synthetic data, MNIST and SVHN, showing that the obtained clusters are distinct, interpretable and result in achieving competitive performance on unsupervised clustering to the state-of-the-art results.", "keywords": "Unsupervised Learning;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Nat Dilokthanakul;Pedro A. M. Mediano;Marta Garnelo;Matthew C.H. Lee;Hugh Salimbeni;Kai Arulkumaran;Murray Shanahan", "authorids": "n.dilokthanakul14@imperial.ac.uk;pmediano@imperial.ac.uk;m.garnelo-abellanas13@imperial.ac.uk;matthew.lee13@imperial.ac.uk;h.salimbeni15@imperial.ac.uk;kailash.arulkumaran13@imperial.ac.uk;m.shanahan@imperial.ac.uk", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\ndilokthanakul2017deep,\ntitle={Deep Unsupervised Clustering with Gaussian Mixture Variational Autoencoders},\nauthor={Nat Dilokthanakul and Pedro A. M. Mediano and Marta Garnelo and Matthew C.H. 
Lee and Hugh Salimbeni and Kai Arulkumaran and Murray Shanahan},\nyear={2017},\nurl={https://openreview.net/forum?id=SJx7Jrtgl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SJx7Jrtgl", "pdf_size": 0, "rating": "4;4;8", "confidence": "4;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 17, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 775, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5552972016491760977&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7 }, { "id": "SJzCSf9xg", "title": "On Detecting Adversarial Perturbations", "track": "main", "status": "Poster", "tldr": "We present and evaluate an approach for detecting adversarial perturbations in images based on attaching a small subnetwork to a deep neural network that is trained specifically to detect adversarial perturbations.", "abstract": "Machine learning and deep learning in particular has advanced tremendously on perceptual tasks in recent years. However, it remains vulnerable against adversarial perturbations of the input that have been crafted specifically to fool the system while being quasi-imperceptible to a human. In this work, we propose to augment deep neural networks with a small ``detector'' subnetwork which is trained on the binary classification task of distinguishing genuine data from data containing adversarial perturbations. Our method is orthogonal to prior work on addressing adversarial perturbations, which has mostly focused on making the classification network itself more robust. We show empirically that adversarial perturbations can be detected surprisingly well even though they are quasi-imperceptible to humans. Moreover, while the detectors have been trained to detect only a specific adversary, they generalize to similar and weaker adversaries. In addition, we propose an adversarial attack that fools both the classifier and the detector and a novel training procedure for the detector that counteracts this attack. 
", "keywords": "Computer vision;Deep learning;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Jan Hendrik Metzen;Tim Genewein;Volker Fischer;Bastian Bischoff", "authorids": "JanHendrik.Metzen@de.bosch.com;Tim.Genewein@de.bosch.com;Volker.Fischer@de.bosch.com;Bastian.Bischoff@de.bosch.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nmetzen2017on,\ntitle={On Detecting Adversarial Perturbations},\nauthor={Jan Hendrik Metzen and Tim Genewein and Volker Fischer and Bastian Bischoff},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=SJzCSf9xg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SJzCSf9xg", "pdf_size": 0, "rating": "5;7;7", "confidence": "3;4;4", "rating_avg": 6.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 13, "authors#_avg": 4, "corr_rating_confidence": 0.9999999999999998, "gs_citation": 1242, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2337805679039722044&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "Sk-oDY9ge", "title": "Diet Networks: Thin Parameters for Fat Genomics", "track": "main", "status": "Poster", "tldr": "Drastically reducing the number of parameters, when the number of input features is orders of magnitude larger than the number of training examples, such as in genomics.", "abstract": "Learning tasks such as those involving genomic data often poses a serious challenge: the number of input features can be orders of magnitude larger than the number of training examples, making it difficult to avoid overfitting, even when using the known regularization techniques. We focus here on tasks in which the input is a description of the genetic variation specific to a patient, the single nucleotide polymorphisms (SNPs), yielding millions of ternary inputs. Improving the ability of deep learning to handle such datasets could have an important impact in medical research, more specifically in precision medicine, where high-dimensional data regarding a particular patient is used to make predictions of interest. Even though the amount of data for such tasks is increasing, this mismatch between the number of examples and the number of inputs remains a concern. Naive implementations of classifier neural networks involve a huge number of free parameters in their first layer (number of input features times number of hidden units): each input feature is associated with as many parameters as there are hidden units. We propose a novel neural network parametrization which considerably reduces the number of free parameters. It is based on the idea that we can first learn or provide a distributed representation for each input feature (e.g. for each position in the genome where variations are observed in data), and then learn (with another neural network called the parameter prediction network) how to map a feature's distributed representation (based on the feature's identity not its value) to the vector of parameters specific to that feature in the classifier neural network (the weights which link the value of the feature to each of the hidden units). This approach views the problem of producing the parameters associated with each feature as a multi-task learning problem. 
We show experimentally on a population stratification task of interest to medical studies that the proposed approach can significantly reduce both the number of parameters and the error rate of the classifier.", "keywords": "Deep learning;Unsupervised Learning;Supervised Learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Adriana Romero;Pierre Luc Carrier;Akram Erraqabi;Tristan Sylvain;Alex Auvolat;Etienne Dejoie;Marc-Andr\u00e9 Legault;Marie-Pierre Dub\u00e9;Julie G. Hussin;Yoshua Bengio", "authorids": "adriana.romero.soriano@umontreal.ca;pierre-luc.carrier@umontreal.ca;akram.er-raqabi@umontreal.ca;Tristan.sylvain@umontreal.ca;;etiennedejoie@gmail.com;marc-andre.legault.1@umontreal.ca;;;yoshua.umontreal@gmail.com", "gender": ";;;;;;;;;", "homepage": ";;;;;;;;;", "dblp": ";;;;;;;;;", "google_scholar": ";;;;;;;;;", "orcid": ";;;;;;;;;", "linkedin": ";;;;;;;;;", "or_profile": ";;;;;;;;;", "aff": ";;;;;;;;;", "aff_domain": ";;;;;;;;;", "position": ";;;;;;;;;", "bibtex": "@inproceedings{\nromero2017diet,\ntitle={Diet Networks: Thin Parameters for Fat Genomics},\nauthor={Adriana Romero and Pierre Luc Carrier and Akram Erraqabi and Tristan Sylvain and Alex Auvolat and Etienne Dejoie and Marc-Andr{\\'e} Legault and Marie-Pierre Dub{\\'e} and Julie G. Hussin and Yoshua Bengio},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=Sk-oDY9ge}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Sk-oDY9ge", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;3;3", "rating_avg": 7.0, "confidence_avg": 3.3333333333333335, "replies_avg": 20, "authors#_avg": 10, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 89, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16474858947438365090&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "Sk2Im59ex", "title": "Unsupervised Cross-Domain Image Generation", "track": "main", "status": "Poster", "tldr": "", "abstract": "We study the problem of transferring a sample in one domain to an analog sample in another domain. Given two related domains, S and T, we would like to learn a generative function G that maps an input sample from S to the domain T, such that the output of a given representation function f, which accepts inputs in either domains, would remain unchanged. Other than f, the training data is unsupervised and consist of a set of samples from each domain, without any mapping between them. The Domain Transfer Network (DTN) we present employs a compound loss function that includes a multiclass GAN loss, an f preserving component, and a regularizing component that encourages G to map samples from T to themselves. 
We apply our method to visual domains including digits and face images and demonstrate its ability to generate convincing novel images of previously unseen entities, while preserving their identity.", "keywords": "Computer vision;Deep learning;Unsupervised Learning;Transfer Learning", "primary_area": "", "supplementary_material": "", "author": "Yaniv Taigman;Adam Polyak;Lior Wolf", "authorids": "yaniv@fb.com;adampolyak@fb.com;wolf@fb.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ntaigman2017unsupervised,\ntitle={Unsupervised Cross-Domain Image Generation},\nauthor={Yaniv Taigman and Adam Polyak and Lior Wolf},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=Sk2Im59ex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Sk2Im59ex", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;4;3", "rating_avg": 6.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 33, "authors#_avg": 3, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 1288, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1045007962742744076&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "Sk2iistgg", "title": "Non-linear Dimensionality Regularizer for Solving Inverse Problems", "track": "main", "status": "Reject", "tldr": "Predicting causal factors of an inverse problem which lie near unknown low-dimensional non-linear manifold defined by a mercer kernel.", "abstract": "Consider an ill-posed inverse problem of estimating causal factors from observations, one of which is known to lie near some (unknown) low-dimensional, non-linear manifold expressed by a predefined Mercer-kernel. Solving this problem requires simultaneous estimation of these factors and learning the low-dimensional representation for them. In this work, we introduce a novel non-linear dimensionality regularization technique for solving such problems without pre-training.\nWe re-formulate Kernel-PCA as an energy minimization problem in which low dimensionality constraints are introduced as regularization terms in the energy.\nTo the best of our knowledge, ours is the first attempt to create a dimensionality regularizer in the KPCA framework. Our approach relies on robustly penalizing the rank of the recovered factors directly in the implicit feature space to create\ntheir low-dimensional approximations in closed form. Our approach performs robust KPCA in the presence of missing data and noise.\nWe demonstrate state-of-the-art results on predicting missing entries in the standard oil flow dataset. 
Additionally, we evaluate our method on the challenging problem of Non-Rigid Structure from Motion and our approach delivers promising results on CMU mocap dataset despite the presence of significant occlusions and noise.", "keywords": "Computer vision;Optimization;Structured prediction", "primary_area": "", "supplementary_material": "", "author": "Ravi Garg;Anders Eriksson;Ian Reid", "authorids": "ravi.garg@adelaide.edu.au;anders.eriksson@qut.edu.au;ian.reid@adelaide.edu.au", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ngarg2017nonlinear,\ntitle={Non-linear Dimensionality Regularizer for Solving Inverse Problems},\nauthor={Ravi Garg and Anders Eriksson and Ian Reid},\nyear={2017},\nurl={https://openreview.net/forum?id=Sk2iistgg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=Sk2iistgg", "pdf_size": 0, "rating": "3;4;4", "confidence": "5;4;4", "rating_avg": 3.6666666666666665, "confidence_avg": 4.333333333333333, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13056284705210155617&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "Sk36NgFeg", "title": "Filling in the details: Perceiving from low fidelity visual input", "track": "main", "status": "Reject", "tldr": "Using generative models to create images from impoverished input similar to those received by our visual cortex", "abstract": "Humans perceive their surroundings in great detail even though most of our visual field is reduced to low-fidelity color-deprived (e.g., dichromatic) input by the retina. In contrast, most deep learning architectures deploy computational resources homogeneously to every part of the visual input. Is such a prodigal deployment of resources necessary? In this paper, we present a framework for investigating the extent to which connectionist architectures can perceive an image in full detail even when presented with low acuity, distorted input. Our goal is to initiate investigations that will be fruitful both for engineering better networks and also for eventually testing hypotheses on the neural mechanisms responsible for our own visual system's ability to perceive missing information. We find that networks can compensate for low acuity input by learning global feature functions that allow the network to fill in some of the missing details. For example, the networks accurately perceive shape and color in the periphery, even when 75\\% of the input is achromatic and low resolution. On the other hand, the network is prone to similar mistakes as humans; for example, when presented with a fully grayscale landscape image, it perceives the sky as blue when the sky is actually a red sunset. ", "keywords": "Deep learning;Computer vision;Semi-Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Farahnaz A. Wick;Michael L. Wick;Marc Pomplun", "authorids": "fwick@cs.umb.edu;mwick@cs.umass.edu;mpomplun@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nwick2017filling,\ntitle={Filling in the details: Perceiving from low fidelity visual input},\nauthor={Farahnaz A. 
Wick and Michael L. Wick and Marc Pomplun},\nyear={2017},\nurl={https://openreview.net/forum?id=Sk36NgFeg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Sk36NgFeg", "pdf_size": 0, "rating": "4;5;6", "confidence": "3;5;4", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 15, "authors#_avg": 3, "corr_rating_confidence": 0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:mwtKqPFVF-IJ:scholar.google.com/&scioq=Filling+in+the+details:+Perceiving+from+low+fidelity+visual+input&hl=en&as_sdt=0,5", "gs_version_total": 3 }, { "id": "Sk8J83oee", "title": "Generative Adversarial Parallelization", "track": "main", "status": "Reject", "tldr": "Creating Synergy with Multiple Generative Adversarial Networks", "abstract": "Generative Adversarial Networks (GAN) have become one of the most studied frameworks for unsupervised learning due to their intuitive formulation. They have also been shown to be capable of generating convincing examples in limited domains, such as low-resolution images. However, they still prove difficult to train in practice and tend to ignore modes of the data generating distribution. Quantitatively capturing effects such as mode coverage and more generally the quality of the generative model still remain elusive. We propose Generative Adversarial Parallelization (GAP), a framework in which many GANs or their variants are trained simultaneously, exchanging their discriminators. This eliminates the tight coupling between a generator and discriminator, leading to improved convergence and improved coverage of modes. We also propose an improved variant of the recently proposed Generative Adversarial Metric and show how it can score individual GANs or their collections under the GAP model.", "keywords": "Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Daniel Jiwoong Im;He Ma;Chris Dongjoo Kim;Graham Taylor", "authorids": "daniel.im@aifounded.com;hma02@uoguelph.ca;ckim07@uoguelph.ca;gwtaylor@uoguelph.ca", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nim2017generative,\ntitle={Generative Adversarial Parallelization},\nauthor={Daniel Jiwoong Im and He Ma and Chris Dongjoo Kim and Graham Taylor},\nyear={2017},\nurl={https://openreview.net/forum?id=Sk8J83oee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=Sk8J83oee", "pdf_size": 0, "rating": "4;4;4", "confidence": "3;4;4", "rating_avg": 4.0, "confidence_avg": 3.6666666666666665, "replies_avg": 13, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4616997731147415307&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "Sk8csP5ex", "title": "The loss surface of residual networks: Ensembles and the role of batch normalization", "track": "main", "status": "Reject", "tldr": "Residual nets are dynamic ensembles", "abstract": "Deep Residual Networks present a premium in performance in comparison to conventional\nnetworks of the same depth and are trainable at extreme depths. It has\nrecently been shown that Residual Networks behave like ensembles of relatively\nshallow networks. 
We show that these ensemble are dynamic: while initially\nthe virtual ensemble is mostly at depths lower than half the network\u2019s depth, as\ntraining progresses, it becomes deeper and deeper. The main mechanism that controls\nthe dynamic ensemble behavior is the scaling introduced, e.g., by the Batch\nNormalization technique. We explain this behavior and demonstrate the driving\nforce behind it. As a main tool in our analysis, we employ generalized spin glass\nmodels, which we also use in order to study the number of critical points in the\noptimization of Residual Networks.", "keywords": "Deep learning;Theory", "primary_area": "", "supplementary_material": "", "author": "Etai Littwin;Lior Wolf", "authorids": "etai.littwin@gmail.com;liorwolf@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nlittwin2017the,\ntitle={The loss surface of residual networks: Ensembles and the role of batch normalization},\nauthor={Etai Littwin and Lior Wolf},\nyear={2017},\nurl={https://openreview.net/forum?id=Sk8csP5ex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Sk8csP5ex", "pdf_size": 0, "rating": "3;7;7", "confidence": "5;3;3", "rating_avg": 5.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 18, "authors#_avg": 2, "corr_rating_confidence": -1.0, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17094398529350068613&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "SkB-_mcel", "title": "Central Moment Discrepancy (CMD) for Domain-Invariant Representation Learning", "track": "main", "status": "Poster", "tldr": "A new method for hidden activation distribution matching in the context of domain adaptation.", "abstract": "The learning of domain-invariant representations in the context of domain adaptation with neural networks is considered. We propose a new regularization method that minimizes the domain-specific latent feature representations directly in the hidden activation space. Although some standard distribution matching approaches exist that can be interpreted as the matching of weighted sums of moments, e.g. Maximum Mean Discrepancy (MMD), an explicit order-wise matching of higher order moments has not been considered before.\nWe propose to match the higher order central moments of probability distributions by means of order-wise moment differences. Our model does not require computationally expensive distance and kernel matrix computations. We utilize the equivalent representation of probability distributions by moment sequences to define a new distance function, called Central Moment Discrepancy (CMD). We prove that CMD is a metric on the set of probability distributions on a compact interval. We further prove that convergence of probability distributions on compact intervals w.r.t. the new metric implies convergence in distribution of the respective random variables.\nWe test our approach on two different benchmark data sets for object recognition (Office) and sentiment analysis of product reviews (Amazon reviews). CMD achieves a new state-of-the-art performance on most domain adaptation tasks of Office and outperforms networks trained with MMD, Variational Fair Autoencoders and Domain Adversarial Neural Networks on Amazon reviews. 
In addition, a post-hoc parameter sensitivity analysis shows that the new approach is stable w. r. t. parameter changes in a certain interval. The source code of the experiments is publicly available.", "keywords": "Transfer Learning;Deep learning;Computer vision", "primary_area": "", "supplementary_material": "", "author": "Werner Zellinger;Thomas Grubinger;Edwin Lughofer;Thomas Natschl\u00e4ger;Susanne Saminger-Platz", "authorids": "werner.zellinger@jku.at;thomas.grubinger@scch.at;edwin.lughofer@jku.at;thomas.natschlaeger@scch.at;susanne.saminger-platz@jku.at", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nzellinger2017central,\ntitle={Central Moment Discrepancy ({CMD}) for Domain-Invariant Representation Learning},\nauthor={Werner Zellinger and Thomas Grubinger and Edwin Lughofer and Thomas Natschl{\\\"a}ger and Susanne Saminger-Platz},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=SkB-_mcel}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SkB-_mcel", "pdf_size": 0, "rating": "6;7;9", "confidence": "4;4;5", "rating_avg": 7.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 13, "authors#_avg": 5, "corr_rating_confidence": 0.944911182523068, "gs_citation": 766, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11153161380714770222&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "SkBsEQYll", "title": "Learning similarity preserving representations with neural similarity and context encoders", "track": "main", "status": "Reject", "tldr": "Neural network way of doing kernel PCA and an extension of word2vec to compute out-of-vocabulary embeddings and distinguish between multiple meanings of a word based on its local context.", "abstract": "We introduce similarity encoders (SimEc), which learn similarity preserving representations by using a feed-forward neural network to map data into an embedding space where the original similarities can be approximated linearly. The model can easily compute representations for novel (out-of-sample) data points, even if the original pairwise similarities of the training set were generated by an unknown process such as human ratings. This is demonstrated by creating embeddings of both image and text data.\nFurthermore, the idea behind similarity encoders gives an intuitive explanation of the optimization strategy used by the continuous bag-of-words (CBOW) word2vec model trained with negative sampling. Based on this insight, we define context encoders (ConEc), which can improve the word embeddings created with word2vec by using the local context of words to create out-of-vocabulary embeddings and representations for words with multiple meanings. 
The benefit of this is illustrated by using these word embeddings as features in the CoNLL 2003 named entity recognition task.", "keywords": "Natural language processing;Unsupervised Learning;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Franziska Horn;Klaus-Robert M\u00fcller", "authorids": "franziska.horn@campus.tu-berlin.de;klaus-robert.mueller@tu-berlin.de", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nhorn2017learning,\ntitle={Learning similarity preserving representations with neural similarity and context encoders},\nauthor={Franziska Horn and Klaus-Robert M{\\\"u}ller},\nyear={2017},\nurl={https://openreview.net/forum?id=SkBsEQYll}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SkBsEQYll", "pdf_size": 0, "rating": "2;3;3", "confidence": "5;4;4", "rating_avg": 2.6666666666666665, "confidence_avg": 4.333333333333333, "replies_avg": 11, "authors#_avg": 2, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7898158641201961740&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "SkCILwqex", "title": "Exploring LOTS in Deep Neural Networks", "track": "main", "status": "Reject", "tldr": "We introduce layerwise origin-target synthesis (LOTS) that can be used for visualizing internal representations of deep neural networks, and for adversarial example generation.", "abstract": "Deep neural networks have recently demonstrated excellent performance on various tasks. Despite recent advances, our understanding of these learning models is still incomplete, at least, as their unexpected vulnerability to imperceptibly small, non-random perturbations revealed. The existence of these so-called adversarial examples presents a serious problem of the application of vulnerable machine learning models. In this paper, we introduce the layerwise origin-target synthesis (LOTS) that can serve multiple purposes. First, we can use it as a visualization technique that gives us insights into the function of any intermediate feature layer by showing the notion of a particular input in deep neural networks. Second, our approach can be applied to assess the invariance of the learned features captured at any layer with respect to the class of the particular input. Finally, we can also utilize LOTS as a general way of producing a vast amount of diverse adversarial examples that can be used for training to further improve the robustness of machine learning models and their performance as well.", "keywords": "Deep learning;Computer vision", "primary_area": "", "supplementary_material": "", "author": "Andras Rozsa;Manuel Gunther;Terrance E. Boult", "authorids": "andras.rozsa@yahoo.com;siebenkopf@googlemail.com;tboult@vast.uccs.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nrozsa2017exploring,\ntitle={Exploring {LOTS} in Deep Neural Networks},\nauthor={Andras Rozsa and Manuel Gunther and Terrance E. 
Boult},\nyear={2017},\nurl={https://openreview.net/forum?id=SkCILwqex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SkCILwqex", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;4;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 20, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=879598926018234085&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "SkC_7v5gx", "title": "The Power of Sparsity in Convolutional Neural Networks", "track": "main", "status": "Reject", "tldr": "Sparse random connections that allow savings to be harvested and that are very effective at compressing CNNs.", "abstract": "Deep convolutional networks are well-known for their high computational and memory demands. Given limited resources, how does one design a network that balances its size, training time, and prediction accuracy? A surprisingly effective approach to trade accuracy for size and speed is to simply reduce the number of channels in each convolutional layer by a fixed fraction and retrain the network. In many cases this leads to significantly smaller networks with only minimal changes to accuracy. In this paper, we take a step further by empirically examining a strategy for deactivating connections between filters in convolutional layers in a way that allows us to harvest savings both in run-time and memory for many network architectures. More specifically, we generalize 2D convolution to use a channel-wise sparse connection structure and show that this leads to significantly better results than the baseline approach for large networks including VGG and Inception V3.", "keywords": "Deep learning;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Soravit Changpinyo;Mark Sandler;Andrey Zhmoginov", "authorids": "schangpi@usc.edu;sandler@google.com;azhmogin@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nchangpinyo2017the,\ntitle={The Power of Sparsity in Convolutional Neural Networks},\nauthor={Soravit Changpinyo and Mark Sandler and Andrey Zhmoginov},\nyear={2017},\nurl={https://openreview.net/forum?id=SkC_7v5gx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SkC_7v5gx", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;4;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 15, "authors#_avg": 3, "corr_rating_confidence": -0.944911182523068, "gs_citation": 171, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10345466128348939552&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "SkJeEtclx", "title": "Memory-augmented Attention Modelling for Videos", "track": "main", "status": "Reject", "tldr": "We propose a novel memory-based attention model for video description", "abstract": "Recent works on neural architectures have demonstrated the utility of attention mechanisms for a wide variety of tasks. Attention models used for problems such as image captioning typically depend on the image under consideration, as well as the previous sequence of words that come before the word currently being generated. 
While these types of models have produced impressive results, they are not able to model the higher-order interactions involved in problems such as video description/captioning, where the relationship between parts of the video and the concepts being depicted is complex. Motivated by these observations, we propose a novel memory-based attention model for video description. Our model utilizes memories of past attention when reasoning about where to attend to in the current time step, similar to the central executive system proposed in human cognition. This allows the model to not only reason about local attention more effectively, it allows it to consider the entire sequence of video frames while generating each word. Evaluation on the challenging and popular MSVD and Charades datasets show that the proposed architecture outperforms all previously proposed methods and leads to a new state of the art results in the video description.", "keywords": "Deep learning;Multi-modal learning;Computer vision", "primary_area": "", "supplementary_material": "", "author": "Rasool Fakoor;Abdel-rahman Mohamed;Margaret Mitchell;Sing Bing Kang;Pushmeet Kohli", "authorids": "rasool.fakoor@mavs.uta.edu;asamir@microsoft.com;margarmitchell@gmail.com;SingBing.Kang@microsoft.com;pkohli@microsoft.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nfakoor2017memoryaugmented,\ntitle={Memory-augmented Attention Modelling for Videos},\nauthor={Rasool Fakoor and Abdel-rahman Mohamed and Margaret Mitchell and Sing Bing Kang and Pushmeet Kohli},\nyear={2017},\nurl={https://openreview.net/forum?id=SkJeEtclx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SkJeEtclx", "pdf_size": 0, "rating": "4;4;4", "confidence": "0;5;4", "rating_avg": 4.0, "confidence_avg": 3.0, "replies_avg": 20, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6778810533299340561&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "SkXIrV9le", "title": "Perception Updating Networks: On architectural constraints for interpretable video generative models", "track": "main", "status": "Workshop", "tldr": "Decoupled \"what\" and \"where\" variational statistical framework and equivalent multi-stream network ", "abstract": "We investigate a neural network architecture and statistical framework that models frames in videos using principles inspired by computer graphics pipelines. The proposed model explicitly represents \"sprites\" or its percepts inferred from maximum likelihood of the scene and infers its movement independently of its content. 
We impose architectural constraints that force the resulting architecture to behave as a recurrent what-where prediction network.", "keywords": "Structured prediction;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Eder Santana;Jose C Principe", "authorids": "edercsjr@gmail.com;principe@cnel.ufl.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nsantana2017perception,\ntitle={Perception Updating Networks: On architectural constraints for interpretable video generative models},\nauthor={Eder Santana and Jose C Principe},\nyear={2017},\nurl={https://openreview.net/forum?id=SkXIrV9le}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SkXIrV9le", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;4;3", "rating_avg": 4.0, "confidence_avg": 3.6666666666666665, "replies_avg": 12, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "SkYbF1slg", "title": "An Information-Theoretic Framework for Fast and Robust Unsupervised Learning via Neural Population Infomax", "track": "main", "status": "Poster", "tldr": "We present a novel information-theoretic framework for fast and robust unsupervised Learning via information maximization for neural population coding.", "abstract": "A framework is presented for unsupervised learning of representations based on infomax principle for large-scale neural populations. We use an asymptotic approximation to the Shannon's mutual information for a large neural population to demonstrate that a good initial approximation to the global information-theoretic optimum can be obtained by a hierarchical infomax method. Starting from the initial solution, an efficient algorithm based on gradient descent of the final objective function is proposed to learn representations from the input datasets, and the method works for complete, overcomplete, and undercomplete bases. As confirmed by numerical experiments, our method is robust and highly efficient for extracting salient features from input datasets. Compared with the main existing methods, our algorithm has a distinct advantage in both the training speed and the robustness of unsupervised representation learning. 
Furthermore, the proposed method is easily extended to the supervised or unsupervised model for training deep structure networks.", "keywords": "Unsupervised Learning;Theory;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Wentao Huang;Kechen Zhang", "authorids": "whuang21@jhmi.edu;kzhang4@jhmi.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nhuang2017an,\ntitle={An Information-Theoretic Framework for Fast and Robust Unsupervised Learning via Neural Population Infomax},\nauthor={Wentao Huang and Kechen Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=SkYbF1slg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SkYbF1slg", "pdf_size": 0, "rating": "5;7;8", "confidence": "2;3;2", "rating_avg": 6.666666666666667, "confidence_avg": 2.3333333333333335, "replies_avg": 12, "authors#_avg": 2, "corr_rating_confidence": 0.18898223650461363, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2183324722689487655&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "SkgSXUKxx", "title": "An Analysis of Feature Regularization for Low-shot Learning", "track": "main", "status": "Reject", "tldr": "An analysis of adding regularization for low-shot learning", "abstract": "Low-shot visual learning, the ability to recognize novel object categories from very few, or even one example, is a hallmark of human visual intelligence. Though successful on many tasks, deep learning approaches tends to be notoriously data-hungry. Recently, feature penalty regularization has been proved effective on capturing new concepts. In this work, we provide both empirical evidence and theoretical analysis on how and why these methods work. We also propose a better design of cost function with improved performance. Close scrutiny reveals the centering effect of feature representation, as well as the intrinsic connection with batch normalization. 
Extensive experiments on synthetic datasets, the one-shot learning benchmark \u201cOmniglot\u201d, and large-scale ImageNet validate our analysis.", "keywords": "Deep learning;Computer vision", "primary_area": "", "supplementary_material": "", "author": "Zhuoyuan Chen;Han Zhao;Xiao Liu;Wei Xu", "authorids": "chenzhuoyuan@baidu.com;liuxiao12@baidu.com;wei.xu@baidu.com;han.zhao@cs.cmu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nchen2017an,\ntitle={An Analysis of Feature Regularization for Low-shot Learning},\nauthor={Zhuoyuan Chen and Han Zhao and Xiao Liu and Wei Xu},\nyear={2017},\nurl={https://openreview.net/forum?id=SkgSXUKxx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SkgSXUKxx", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;3;3", "rating_avg": 5.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 15, "authors#_avg": 4, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6694279309749693015&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "SkgewU5ll", "title": "GRAM: Graph-based Attention Model for Healthcare Representation Learning", "track": "main", "status": "Reject", "tldr": "We propose a novel attention mechanism on graphs to learn representations for medical concepts from both data and medical ontologies to cope with insufficient data volume.", "abstract": "Deep learning methods exhibit promising performance for predictive modeling in healthcare, but two important challenges remain:\n- Data insufficiency: Often in healthcare predictive modeling, the sample size is insufficient for deep learning methods to achieve satisfactory results. \n- Interpretation: The representations learned by deep learning models should align with medical knowledge.\nTo address these challenges, we propose a GRaph-based Attention Model, GRAM that supplements electronic health records (EHR) with hierarchical information inherent to medical ontologies. \nBased on the data volume and the ontology structure, GRAM represents a medical concept as a combination of its ancestors in the ontology via an attention mechanism. \nWe compared predictive performance (i.e. accuracy, data needs, interpretability) of GRAM to various methods including the recurrent neural network (RNN) in two sequential diagnoses prediction tasks and one heart failure prediction task.\nCompared to the basic RNN, GRAM achieved 10% higher accuracy for predicting diseases rarely observed in the training data and 3% improved area under the ROC curve for predicting heart failure using an order of magnitude less training data. Additionally, unlike other methods, the medical concept representations learned by GRAM are well aligned with the medical ontology. Finally, GRAM exhibits intuitive attention behaviors by adaptively generalizing to higher level concepts when facing data insufficiency at the lower level concepts.", "keywords": "Deep learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Edward Choi;Mohammad Taha Bahadori;Le Song;Walter F. 
Stewart;Jimeng Sun", "authorids": "mp2893@gatech.edu;bahadori@gatech.edu;lsong@cc.gatech.edu;stewarwf@sutterhealth.org;jsun@cc.gatech.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nchoi2017gram,\ntitle={{GRAM}: Graph-based Attention Model for Healthcare Representation Learning},\nauthor={Edward Choi and Mohammad Taha Bahadori and Le Song and Walter F. Stewart and Jimeng Sun},\nyear={2017},\nurl={https://openreview.net/forum?id=SkgewU5ll}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SkgewU5ll", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;4;3", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 14, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 868, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9237090392530766045&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 14 }, { "id": "SkhU2fcll", "title": "Deep Multi-task Representation Learning: A Tensor Factorisation Approach", "track": "main", "status": "Poster", "tldr": "A multi-task representation learning framework that learns cross-task sharing structure at every layer in a deep network.", "abstract": "Most contemporary multi-task learning methods assume linear models. This setting is considered shallow in the era of deep learning. In this paper, we present a new deep multi-task representation learning framework that learns cross-task sharing structure at every layer in a deep network. Our approach is based on generalising the matrix factorisation techniques explicitly or implicitly used by many conventional MTL algorithms to tensor factorisation, to realise automatic learning of end-to-end knowledge sharing in deep networks. This is in contrast to existing deep learning approaches that need a user-defined multi-task sharing strategy. Our approach applies to both homogeneous and heterogeneous MTL. Experiments demonstrate the efficacy of our deep multi-task representation learning in terms of both higher accuracy and fewer design choices.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yongxin Yang;Timothy M. Hospedales", "authorids": "yongxin.yang@qmul.ac.uk;t.hospedales@qmul.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nyang2017deep,\ntitle={Deep Multi-task Representation Learning: A Tensor Factorisation Approach},\nauthor={Yongxin Yang and Timothy M. 
Hospedales},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=SkhU2fcll}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SkhU2fcll", "pdf_size": 0, "rating": "5;7;8", "confidence": "3;4;4", "rating_avg": 6.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 11, "authors#_avg": 2, "corr_rating_confidence": 0.9449111825230683, "gs_citation": 330, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15402082780595310810&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "id": "SkkTMpjex", "title": "Distributed Second-Order Optimization using Kronecker-Factored Approximations", "track": "main", "status": "Poster", "tldr": "Fixed typos pointed out by AnonReviewer1 and AnonReviewer4 and added the experiments in Fig. 6 showing the poor scaling of batch normalized SGD using a batch size of 2048 on googlenet. ", "abstract": "As more computational resources become available, machine learning researchers train ever larger neural networks on millions of data points using stochastic gradient descent (SGD). Although SGD scales well in terms of both the size of dataset and the number of parameters of the model, it has rapidly diminishing returns as parallel computing resources increase. Second-order optimization methods have an affinity for well-estimated gradients and large mini-batches, and can therefore benefit much more from parallel computation in principle. Unfortunately, they often employ severe approximations to the curvature matrix in order to scale to large models with millions of parameters, limiting their effectiveness in practice versus well-tuned SGD with momentum. The recently proposed K-FAC method(Martens and Grosse, 2015) uses a stronger and more sophisticated curvature approximation, and has been shown to make much more per-iteration progress than SGD, while only introducing a modest overhead. In this paper, we develop a version of K-FAC that distributes the computation of gradients and additional quantities required by K-FAC across multiple machines, thereby taking advantage of method\u2019s superior scaling to large mini-batches and mitigating its additional overheads. We provide a Tensorflow implementation of our approach which is easy to use and can be applied to many existing codebases without modification. Additionally, we develop several algorithmic enhancements to K-FAC which can improve its computational performance for very large models. 
Finally, we show that our distributed K-FAC method speeds up training of various state-of-the-art ImageNet classification models by a factor of two compared to Batch Normalization(Ioffe and Szegedy, 2015).", "keywords": "Deep learning;Optimization", "primary_area": "", "supplementary_material": "", "author": "Jimmy Ba;Roger Grosse;James Martens", "authorids": "jimmy@psi.toronto.edu;rgrosse@cs.toronto.edu;jmartens@cs.toronto.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nba2017distributed,\ntitle={Distributed Second-Order Optimization using Kronecker-Factored Approximations},\nauthor={Jimmy Ba and Roger Grosse and James Martens},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=SkkTMpjex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=SkkTMpjex", "pdf_size": 0, "rating": "6;7", "confidence": "3;4", "rating_avg": 6.5, "confidence_avg": 3.5, "replies_avg": 13, "authors#_avg": 3, "corr_rating_confidence": 0.9999999999999999, "gs_citation": 124, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3347200104411983272&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "Skn9Shcxe", "title": "Highway and Residual Networks learn Unrolled Iterative Estimation", "track": "main", "status": "Poster", "tldr": "", "abstract": "The past year saw the introduction of new architectures such as Highway networks and Residual networks which, for the first time, enabled the training of feedforward networks with dozens to hundreds of layers using simple gradient descent.\nWhile depth of representation has been posited as a primary reason for their success, there are indications that these architectures defy a popular view of deep learning as a hierarchical computation of increasingly abstract features at each layer.\n\nIn this report, we argue that this view is incomplete and does not adequately explain several recent findings.\nWe propose an alternative viewpoint based on unrolled iterative estimation---a group of successive layers iteratively refine their estimates of the same features instead of computing an entirely new representation.\nWe demonstrate that this viewpoint directly leads to the construction of highway and residual networks. \nFinally we provide preliminary experiments to discuss the similarities and differences between the two architectures.", "keywords": "Theory;Deep learning;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Klaus Greff;Rupesh K. Srivastava;J\u00fcrgen Schmidhuber", "authorids": "klaus@idsia.ch;rupesh@idsia.ch;juergen@idsia.ch", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ngreff2017highway,\ntitle={Highway and Residual Networks learn Unrolled Iterative Estimation},\nauthor={Klaus Greff and Rupesh K. 
Srivastava and J{\\\"u}rgen Schmidhuber},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=Skn9Shcxe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=Skn9Shcxe", "pdf_size": 0, "rating": "6;7;8", "confidence": "5;4;4", "rating_avg": 7.0, "confidence_avg": 4.333333333333333, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 277, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14457128463377455102&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "SkpSlKIel", "title": "Why Deep Neural Networks for Function Approximation?", "track": "main", "status": "Poster", "tldr": "", "abstract": "Recently there has been much interest in understanding why deep neural networks are preferred to shallow networks. We show that, for a large class of piecewise smooth functions, the number of neurons needed by a shallow network to approximate a function is exponentially larger than the corresponding number of neurons needed by a deep network for a given degree of function approximation. First, we consider univariate functions on a bounded interval and require a neural network to achieve an approximation error of $\\varepsilon$ uniformly over the interval. We show that shallow networks (i.e., networks whose depth does not depend on $\\varepsilon$) require $\\Omega(\\text{poly}(1/\\varepsilon))$ neurons while deep networks (i.e., networks whose depth grows with $1/\\varepsilon$) require $\\mathcal{O}(\\text{polylog}(1/\\varepsilon))$ neurons. We then extend these results to certain classes of important multivariate functions. Our results are derived for neural networks which use a combination of rectifier linear units (ReLUs) and binary step units, two of the most popular type of activation functions. Our analysis builds on a simple observation: the multiplication of two bits can be represented by a ReLU.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shiyu Liang;R. Srikant", "authorids": "sliang26@illinois.edu;rsrikant@illinois.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nliang2017why,\ntitle={Why Deep Neural Networks for Function Approximation?},\nauthor={Shiyu Liang and R. Srikant},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=SkpSlKIel}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SkpSlKIel", "pdf_size": 0, "rating": "7;7;7", "confidence": "3;4;4", "rating_avg": 7.0, "confidence_avg": 3.6666666666666665, "replies_avg": 11, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 458, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8793393504564238281&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "Skq89Scxx", "title": "SGDR: Stochastic Gradient Descent with Warm Restarts", "track": "main", "status": "Poster", "tldr": "We propose a simple warm restart technique for stochastic gradient descent to improve its anytime performance.", "abstract": "Restart techniques are common in gradient-free optimization to deal with multimodal functions. 
Partial warm restarts are also gaining popularity in gradient-based optimization to improve the rate of convergence in accelerated gradient schemes to deal with ill-conditioned functions. In this paper, we propose a simple warm restart technique for stochastic gradient descent to improve its anytime performance when training deep neural networks. We empirically study its performance on the CIFAR-10 and CIFAR-100 datasets, \nwhere we demonstrate new state-of-the-art results at 3.14\\% and 16.21\\%, respectively. We also demonstrate its advantages on a dataset of EEG recordings and on a downsampled version of the ImageNet dataset. Our source code is available at \\\\ \\url{https://github.com/loshchil/SGDR}", "keywords": "Deep learning;Optimization", "primary_area": "", "supplementary_material": "", "author": "Ilya Loshchilov;Frank Hutter", "authorids": "ilya@cs.uni-freiburg.de;fh@cs.uni-freiburg.de", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nloshchilov2017sgdr,\ntitle={{SGDR}: Stochastic Gradient Descent with Warm Restarts},\nauthor={Ilya Loshchilov and Frank Hutter},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=Skq89Scxx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=Skq89Scxx", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;5;3", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 10671, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9496349859848656559&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12 }, { "id": "SkqMSCHxe", "title": "PREDICTION OF POTENTIAL HUMAN INTENTION USING SUPERVISED COMPETITIVE LEARNING", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose a learning method to quantify human intention. Generally, a human being will imagine several potential actions for a given scene, but only one of these actions will subsequently be taken. This makes it difficult to quantify human intentions.\nTo solve this problem, we apply competitive learning to human behavior prediction as supervised learning. In our approach, competitive learning generates several outputs that are then associated with several potential situations imagined by a human. We applied the proposed method to human driving behavior and extracted three potential driving patterns. Results showed a squared error is reduced to 1/25 that of a conventional method . 
We also found that competitive learning can distinguish valid data from disturbance data in order to train a model.", "keywords": "Computer vision;Deep learning;Supervised Learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Masayoshi Ishikawa;Mariko Okude;Takehisa Nishida & Kazuo Muto", "authorids": "masayoshi.ishikawa.gv@hitachi.com;mariko.okude.uh@hitachi.com;takehisa.nishida.cu@hitachi.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nishikawa2017prediction,\ntitle={{PREDICTION} {OF} {POTENTIAL} {HUMAN} {INTENTION} {USING} {SUPERVISED} {COMPETITIVE} {LEARNING}},\nauthor={Masayoshi Ishikawa and Mariko Okude and Takehisa Nishida {\\&} Kazuo Muto},\nyear={2017},\nurl={https://openreview.net/forum?id=SkqMSCHxe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SkqMSCHxe", "pdf_size": 0, "rating": "2;2;4", "confidence": "4;4;4", "rating_avg": 2.6666666666666665, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ZCD_pdGQSYkJ:scholar.google.com/&scioq=PREDICTION+OF+POTENTIAL+HUMAN+INTENTION+USING+SUPERVISED+COMPETITIVE+LEARNING&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "Sks3zF9eg", "title": "Taming the waves: sine as activation function in deep neural networks", "track": "main", "status": "Reject", "tldr": "Why nets with sine as activation function are difficult to train in theory. Also, they often don't use the periodic part if not needed, but when it's beneficial they might learn faster", "abstract": "Most deep neural networks use non-periodic and monotonic\u2014or at least\nquasiconvex\u2014 activation functions. While sinusoidal activation functions have\nbeen successfully used for specific applications, they remain largely ignored and\nregarded as difficult to train. In this paper we formally characterize why these\nnetworks can indeed often be difficult to train even in very simple scenarios, and\ndescribe how the presence of infinitely many and shallow local minima emerges\nfrom the architecture. We also provide an explanation to the good performance\nachieved on a typical classification task, by showing that for several network architectures\nthe presence of the periodic cycles is largely ignored when the learning\nis successful. 
Finally, we show that there are non-trivial tasks\u2014such as learning\nalgorithms\u2014where networks using sinusoidal activations can learn faster than\nmore established monotonic functions.", "keywords": "Theory;Deep learning;Optimization;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Giambattista Parascandolo;Heikki Huttunen;Tuomas Virtanen", "authorids": "giambattista.parascandolo@tut.fi;heikki.huttunen@tut.fi;tuomas.virtanen@tut.fi", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nparascandolo2017taming,\ntitle={Taming the waves: sine as activation function in deep neural networks},\nauthor={Giambattista Parascandolo and Heikki Huttunen and Tuomas Virtanen},\nyear={2017},\nurl={https://openreview.net/forum?id=Sks3zF9eg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Sks3zF9eg", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;4;4", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 11, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 78, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9907367808466370693&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "Sks9_ajex", "title": "Paying More Attention to Attention: Improving the Performance of Convolutional Neural Networks via Attention Transfer", "track": "main", "status": "Poster", "tldr": "", "abstract": "Attention plays a critical role in human visual experience. Furthermore, it has recently been demonstrated that attention can also play an important role in the context of applying artificial neural networks to a variety of tasks from fields such as computer vision and NLP. In this work we show that, by properly defining attention for convolutional neural networks, we can actually use this type of information in order to significantly improve the performance of a student CNN network by forcing it to mimic the attention maps of a powerful teacher network. 
To that end, we propose several novel methods of transferring attention, showing consistent improvement across a variety of datasets and convolutional neural network architectures.", "keywords": "Computer vision;Deep learning;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Sergey Zagoruyko;Nikos Komodakis", "authorids": "sergey.zagoruyko@enpc.fr;nikos.komodakis@enpc.fr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nzagoruyko2017paying,\ntitle={Paying More Attention to Attention: Improving the Performance of Convolutional Neural Networks via Attention Transfer},\nauthor={Sergey Zagoruyko and Nikos Komodakis},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=Sks9_ajex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=Sks9_ajex", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;4;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 18, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 3439, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8439472615885524081&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "SkuqA_cgx", "title": "Automated Generation of Multilingual Clusters for the Evaluation of Distributed Representations", "track": "main", "status": "Workshop", "tldr": "Applying simple heuristics to the Wikidata entity graph results in a high-quality semantic similarity dataset.", "abstract": "We propose a language-agnostic way of automatically generating sets of semantically similar clusters of entities along with sets of \"outlier\" elements, which may then be used to perform an intrinsic evaluation of word embeddings in the outlier detection task. We used our methodology to create a gold-standard dataset, which we call WikiSem500, and evaluated multiple state-of-the-art embeddings. 
The results show a correlation between performance on this dataset and performance on sentiment analysis.", "keywords": "Natural language processing;Applications", "primary_area": "", "supplementary_material": "", "author": "Philip Blair;Yuval Merhav;Joel Barry", "authorids": "pblair@basistech.com;yuval@basistech.com;joelb@basistech.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nblair2017automated,\ntitle={Automated Generation of Multilingual Clusters for the Evaluation of Distributed Representations},\nauthor={Philip Blair and Yuval Merhav and Joel Barry},\nyear={2017},\nurl={https://openreview.net/forum?id=SkuqA_cgx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SkuqA_cgx", "pdf_size": 0, "rating": "5;6;8", "confidence": "3;3;4", "rating_avg": 6.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 11, "authors#_avg": 3, "corr_rating_confidence": 0.944911182523068, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15010971537518099310&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "Skvgqgqxe", "title": "Learning to Compose Words into Sentences with Reinforcement Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "We use reinforcement learning to learn\ntree-structured neural networks for computing representations of natural language sentences.\nIn contrast with prior work on tree-structured models, in which the trees are either provided as input or\npredicted using supervision from explicit treebank annotations,\nthe tree structures in this work are optimized to improve performance on a downstream task.\nExperiments demonstrate the benefit of\nlearning task-specific composition orders, outperforming both sequential encoders and recursive encoders based on treebank annotations.\nWe analyze the induced trees and show that while they discover\nsome linguistically intuitive structures (e.g., noun phrases, simple verb phrases),\nthey are different than conventional English syntactic structures.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dani Yogatama;Phil Blunsom;Chris Dyer;Edward Grefenstette;Wang Ling", "authorids": "dyogatama@google.com;pblunsom@google.com;cdyer@google.com;etg@google.com;lingwang@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nyogatama2017learning,\ntitle={Learning to Compose Words into Sentences with Reinforcement Learning},\nauthor={Dani Yogatama and Phil Blunsom and Chris Dyer and Edward Grefenstette and Wang Ling},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=Skvgqgqxe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Skvgqgqxe", "pdf_size": 0, "rating": "6;7;8", "confidence": "3;5;4", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 24, "authors#_avg": 5, "corr_rating_confidence": 0.5, "gs_citation": 208, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17745324246331075431&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "SkwSJ99ex", "title": "DeepRebirth: A General 
Approach for Accelerating Deep Neural Network Execution on Mobile Devices", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deploying deep neural networks on mobile devices is a challenging task due to computation complexity and memory intensity. Existing works solve this problem by reducing model size using weight approximation methods based on dimension reduction (i.e., SVD, Tucker decomposition and Quantization). However, the execution speed of these compressed models are still far below the real-time processing requirement of mobile services. To address this limitation, we propose a novel acceleration framework: DeepRebirth by exploring the deep learning model parameter sparsity through merging the parameter-free layers with their neighbor convolution layers to a single dense layer. The design of DeepRebirth is motivated by the key observation: some layers (i.e., normalization and pooling) in deep learning models actually consume a large portion of computational time even few learned parameters are involved, and acceleration of these layers has the potential to improve the processing speed significantly. Essentially, the functionality of several merged layers is replaced by the new dense layer \u2013 rebirth layer in DeepRebirth. In order to preserve the same functionality, the rebirth layer model parameters are re-trained to be functionality equivalent to the original several merged layers. The extensive experiments performed on ImageNet using several popular mobile devices demonstrate that DeepRebirth is not only providing huge speed-up in model deployment and significant memory saving but also maintaining the model accuracy, i.e., 3x-5x speed-up and energy saving on GoogLeNet with only 0.4% accuracy drop on top-5 categorization in ImageNet. Further, by combining with other model compression techniques, DeepRebirth offers an average of 65ms model forwarding time on each image using Samsung Galaxy S6 with only 2.4% accuracy drop. 
In addition, 2.5x run-time memory saving is achieved with rebirth layers.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dawei Li;Xiaolong Wang;Deguang Kong;Mooi Choo Chuah", "authorids": ";;;", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nli2017deeprebirth,\ntitle={DeepRebirth: A General Approach for Accelerating Deep Neural Network Execution on Mobile Devices},\nauthor={Dawei Li and Xiaolong Wang and Deguang Kong and Mooi Choo Chuah},\nyear={2017},\nurl={https://openreview.net/forum?id=SkwSJ99ex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SkwSJ99ex", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;4;4", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 13, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:uOYN-YnYexIJ:scholar.google.com/&scioq=DeepRebirth:+A+General+Approach+for+Accelerating+Deep+Neural+Network+Execution+on+Mobile+Devices&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "SkxKPDv5xl", "title": "SampleRNN: An Unconditional End-to-End Neural Audio Generation Model", "track": "main", "status": "Poster", "tldr": "Novel model for unconditional audio generation task using hierarchical multi-scale RNNs and autoregressive MLP.", "abstract": "In this paper we propose a novel model for unconditional audio generation task that generates one audio sample at a time. We show that our model which profits from combining memory-less modules, namely autoregressive multilayer perceptron, and stateful recurrent neural networks in a hierarchical structure is de facto powerful to capture the underlying sources of variations in temporal domain for very long time on three datasets of different nature. Human evaluation on the generated samples indicate that our model is preferred over competing models. 
We also show how each component of the model contributes to the exhibited performance.", "keywords": "Speech;Deep learning;Unsupervised Learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Soroush Mehri;Kundan Kumar;Ishaan Gulrajani;Rithesh Kumar;Shubham Jain;Jose Sotelo;Aaron Courville;Yoshua Bengio", "authorids": "soroush.mehri@umontreal.ca;kundankumar2510@gmail.com;igul222@gmail.com;ritheshkumar.95@gmail.com;shubhamjain1310@gmail.com;rdz.sotelo@gmail.com;aaron.courville@umontreal.ca;yoshua.bengio@umontreal.ca", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@inproceedings{\nmehri2017samplernn,\ntitle={Sample{RNN}: An Unconditional End-to-End Neural Audio Generation Model},\nauthor={Soroush Mehri and Kundan Kumar and Ishaan Gulrajani and Rithesh Kumar and Shubham Jain and Jose Sotelo and Aaron Courville and Yoshua Bengio},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=SkxKPDv5xl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SkxKPDv5xl", "pdf_size": 0, "rating": "8;8;9", "confidence": "3;4;4", "rating_avg": 8.333333333333334, "confidence_avg": 3.6666666666666665, "replies_avg": 16, "authors#_avg": 8, "corr_rating_confidence": 0.5, "gs_citation": 761, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18296195672519025121&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "SkyQWDcex", "title": "A Context-aware Attention Network for Interactive Question Answering", "track": "main", "status": "Reject", "tldr": "A self-adaptive QA model aware of what it knows and what it does not know for interactive question answering.", "abstract": "We develop a new model for Interactive Question Answering (IQA), using Gated-Recurrent-Unit recurrent networks (GRUs) as encoders for statements and questions, and another GRU as a decoder for outputs. Distinct from previous work, our approach employs context-dependent word-level attention for more accurate statement representations and question-guided sentence-level attention for better context modeling. Employing these mechanisms, our model accurately understands when it can output an answer or when it requires generating a supplementary question for additional input. When available, user's feedback is encoded and directly applied to update sentence-level attention to infer the answer. 
Extensive experiments on QA and IQA datasets demonstrate quantitatively the effectiveness of our model with significant improvement over conventional QA models.", "keywords": "Deep learning;Natural language processing;Applications", "primary_area": "", "supplementary_material": "", "author": "Huayu Li;Martin Renqiang Min;Yong Ge;Asim Kadav", "authorids": "hli38@uncc.edu;renqiang@nec-labs.com;yongge@email.arizona.edu;asim@nec-labs.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nli2017a,\ntitle={A Context-aware Attention Network for Interactive Question Answering},\nauthor={Huayu Li and Martin Renqiang Min and Yong Ge and Asim Kadav},\nyear={2017},\nurl={https://openreview.net/forum?id=SkyQWDcex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SkyQWDcex", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;3", "rating_avg": 4.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 11, "authors#_avg": 4, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 94, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2345455859771892646&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9 }, { "id": "Sy1rwtKxg", "title": "Parallel Stochastic Gradient Descent with Sound Combiners", "track": "main", "status": "Reject", "tldr": "This paper proposes SymSGD, a parallel SGD algorithm that retains the sequential semantics of SGD in expectation.", "abstract": "Stochastic gradient descent (SGD) is a well-known method for regression and classification tasks. However, it is an inherently sequential algorithm \u2014 at each step, the processing of the current example depends on the parameters learned from the previous examples. Prior approaches to parallelizing SGD, such as Hogwild! and AllReduce, do not honor these dependences across threads and thus can potentially suffer poor convergence rates and/or poor scalability. This paper proposes SymSGD, a parallel SGD algorithm that retains the sequential semantics of SGD in expectation. Each thread in this approach learns a local model and a probabilistic model combiner that allows the local models to be combined to produce the same result as what a sequential SGD would have produced, in expectation. This SymSGD approach is applicable to any linear learner whose update rule is linear. 
This paper evaluates SymSGD\u2019s accuracy and performance on 9 datasets on a shared-memory machine shows up-to 13\u00d7 speedup over our heavily optimized sequential baseline on 16 cores.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Saeed Maleki;Madanlal Musuvathi;Todd Mytkowicz;Yufei Ding", "authorids": "saemal@microsoft.com;madanm@microsoft.com;toddm@microsoft.com;yding8@ncsu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nmaleki2017parallel,\ntitle={Parallel Stochastic Gradient Descent with Sound Combiners},\nauthor={Saeed Maleki and Madanlal Musuvathi and Todd Mytkowicz and Yufei Ding},\nyear={2017},\nurl={https://openreview.net/forum?id=Sy1rwtKxg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Sy1rwtKxg", "pdf_size": 0, "rating": "4;4;6", "confidence": "5;5;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.666666666666667, "replies_avg": 14, "authors#_avg": 4, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2578088658838082396&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 3 }, { "id": "Sy2fzU9gl", "title": "beta-VAE: Learning Basic Visual Concepts with a Constrained Variational Framework", "track": "main", "status": "Poster", "tldr": "We introduce beta-VAE, a new state-of-the-art framework for automated discovery of interpretable factorised latent representations from raw image data in a completely unsupervised manner.", "abstract": "Learning an interpretable factorised representation of the independent data generative factors of the world without supervision is an important precursor for the development of artificial intelligence that is able to learn and reason in the same way that humans do. We introduce beta-VAE, a new state-of-the-art framework for automated discovery of interpretable factorised latent representations from raw image data in a completely unsupervised manner. Our approach is a modification of the variational autoencoder (VAE) framework. We introduce an adjustable hyperparameter beta that balances latent channel capacity and independence constraints with reconstruction accuracy. We demonstrate that beta-VAE with appropriately tuned\f beta > 1 qualitatively outperforms VAE (beta = 1), as well as state of the art unsupervised (InfoGAN) and semi-supervised (DC-IGN) approaches to disentangled factor learning on a variety of datasets (celebA, faces and chairs). Furthermore, we devise a protocol to quantitatively compare the degree of disentanglement learnt by different models, and show that our approach also significantly outperforms all baselines quantitatively. 
Unlike InfoGAN, beta-VAE is stable to train, makes few assumptions about the data and relies on tuning a single hyperparameter, which can be directly optimised through a hyper parameter search using weakly labelled data or through heuristic visual inspection for purely unsupervised data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Irina Higgins;Loic Matthey;Arka Pal;Christopher Burgess;Xavier Glorot;Matthew Botvinick;Shakir Mohamed;Alexander Lerchner", "authorids": "irinah@google.com;lmatthey@google.com;arkap@google.com;cpburgess@google.com;glorotx@google.com;botvinick@google.com;shakir@google.com;lerchner@google.com", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@inproceedings{\nhiggins2017betavae,\ntitle={beta-{VAE}: Learning Basic Visual Concepts with a Constrained Variational Framework},\nauthor={Irina Higgins and Loic Matthey and Arka Pal and Christopher Burgess and Xavier Glorot and Matthew Botvinick and Shakir Mohamed and Alexander Lerchner},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=Sy2fzU9gl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=Sy2fzU9gl", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;4;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 14, "authors#_avg": 8, "corr_rating_confidence": 0.0, "gs_citation": 6129, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14509618796321811955&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "Sy4tzwqxe", "title": "Two Methods for Wild Variational Inference", "track": "main", "status": "Reject", "tldr": "", "abstract": "Variational inference provides a powerful tool for approximate probabilistic inference on complex, structured models. Typical variational inference methods, however, require to use inference networks with computationally tractable probability density functions. This largely limits the design and implementation of variational inference methods. We consider wild variational inference methods that do not require tractable density functions on the inference networks, and hence can be applied in more challenging cases. 
As an example of application, we treat stochastic gradient Langevin dynamics (SGLD) as an inference network, and use our methods to automatically adjust the step sizes of SGLD to maximize its convergence speed, significantly outperforming the hand-designed step size schemes.", "keywords": "Theory", "primary_area": "", "supplementary_material": "", "author": "Qiang Liu;Yihao Feng", "authorids": "qiang.liu@dartmouth.edu;yihao.feng.gr@dartmouth.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nliu2017two,\ntitle={Two Methods for Wild Variational Inference},\nauthor={Qiang Liu and Yihao Feng},\nyear={2017},\nurl={https://openreview.net/forum?id=Sy4tzwqxe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Sy4tzwqxe", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;4;4", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16399587950584623222&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "Sy6iJDqlx", "title": "Attend, Adapt and Transfer: Attentive Deep Architecture for Adaptive Transfer from multiple sources in the same domain", "track": "main", "status": "Poster", "tldr": "We propose a general architecture for transfer that can avoid negative transfer and transfer selectively from multiple source tasks in the same domain.", "abstract": "Transferring knowledge from prior source tasks in solving a new target task can be useful in several learning applications. The application of transfer poses two serious challenges which have not been adequately addressed. First, the agent should be able to avoid negative transfer, which happens when the transfer hampers or slows down the learning instead of helping it. Second, the agent should be able to selectively transfer, which is the ability to select and transfer from different and multiple source tasks for different parts of the state space of the target task. We propose A2T (Attend Adapt and Transfer), an attentive deep architecture which adapts and transfers from these source tasks. Our model is generic enough to effect transfer of either policies or value functions. Empirical evaluations on different learning algorithms show that A2T is an effective architecture for transfer by being able to avoid negative transfer while transferring selectively from multiple source tasks in the same domain.", "keywords": "Deep learning;Reinforcement Learning;Transfer Learning", "primary_area": "", "supplementary_material": "", "author": "Janarthanan Rajendran;Aravind Lakshminarayanan;Mitesh M. Khapra;Prasanna P;Balaraman Ravindran", "authorids": "rjana@umich.edu;aravindsrinivas@gmail.com;miteshk@cse.iitm.ac.in;prasanna.p@cs.mcgill.ca;ravi@cse.iitm.ac.in", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nrajendran2017attend,\ntitle={Attend, Adapt and Transfer: Attentive Deep Architecture for Adaptive Transfer from multiple sources in the same domain},\nauthor={Janarthanan Rajendran and Aravind Lakshminarayanan and Mitesh M. 
Khapra and Prasanna P and Balaraman Ravindran},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=Sy6iJDqlx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer5;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=Sy6iJDqlx", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;4;3", "rating_avg": 7.0, "confidence_avg": 3.6666666666666665, "replies_avg": 12, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 75, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9494526501185693853&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "Sy7m72Ogg", "title": "An Actor-critic Algorithm for Learning Rate Learning", "track": "main", "status": "Reject", "tldr": "We propose an algorithm to automatically learn learning rates using actor-critic methods from reinforcement learning.", "abstract": "Stochastic gradient descent (SGD), which updates the model parameters by adding a local gradient times a learning rate at each step, is widely used in model training of machine learning algorithms such as neural networks. It is observed that the models trained by SGD are sensitive to learning rates and good learning rates are problem specific. To avoid manually searching of learning rates, which is tedious and inefficient, we propose an algorithm to automatically learn learning rates using actor-critic methods from reinforcement learning (RL). In particular, we train a policy network called actor to decide the learning rate at each step during training, and a value network called critic to give feedback about quality of the decision (e.g., the goodness of the learning rate outputted by the actor) that the actor made. Experiments show that our method leads to good convergence of SGD and can prevent overfitting to a certain extent, resulting in better performance than human-designed competitors.", "keywords": "Deep learning;Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Chang Xu;Tao Qin;Gang Wang;Tie-Yan Liu", "authorids": "changxu@nbjl.nankai.edu.cn;taoqin@microsoft.com;wgzwp@nbjl.nankai.edu.cn;tie-yan.liu@microsoft.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nxu2017an,\ntitle={An Actor-critic Algorithm for Learning Rate Learning},\nauthor={Chang Xu and Tao Qin and Gang Wang and Tie-Yan Liu},\nyear={2017},\nurl={https://openreview.net/forum?id=Sy7m72Ogg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Sy7m72Ogg", "pdf_size": 0, "rating": "3;4;5", "confidence": "5;4;4", "rating_avg": 4.0, "confidence_avg": 4.333333333333333, "replies_avg": 9, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2941631752644900627&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "Sy8gdB9xx", "title": "Understanding deep learning requires rethinking generalization", "track": "main", "status": "Oral", "tldr": "Through extensive systematic experiments, we show how the traditional approaches fail to explain why large neural networks generalize well in practice, and why understanding deep learning requires rethinking generalization.", "abstract": "Despite their massive size, successful deep artificial neural 
networks can\nexhibit a remarkably small difference between training and test performance.\nConventional wisdom attributes small generalization error either to properties\nof the model family, or to the regularization techniques used during training.\n\nThrough extensive systematic experiments, we show how these traditional\napproaches fail to explain why large neural networks generalize well in\npractice. Specifically, our experiments establish that state-of-the-art\nconvolutional networks for image classification trained with stochastic\ngradient methods easily fit a random labeling of the training data. This\nphenomenon is qualitatively unaffected by explicit regularization, and occurs\neven if we replace the true images by completely unstructured random noise. We\ncorroborate these experimental findings with a theoretical construction\nshowing that simple depth two neural networks already have perfect finite\nsample expressivity as soon as the number of parameters exceeds the\nnumber of data points as it usually does in practice.\n\nWe interpret our experimental findings by comparison with traditional models.", "keywords": "Deep learning", "primary_area": "", "supplementary_material": "", "author": "Chiyuan Zhang;Samy Bengio;Moritz Hardt;Benjamin Recht;Oriol Vinyals", "authorids": "chiyuan@mit.edu;bengio@google.com;mrtz@google.com;brecht@berkeley.edu;vinyals@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nzhang2017understanding,\ntitle={Understanding deep learning requires rethinking generalization},\nauthor={Chiyuan Zhang and Samy Bengio and Moritz Hardt and Benjamin Recht and Oriol Vinyals},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=Sy8gdB9xx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Sy8gdB9xx", "pdf_size": 0, "rating": "9;10;10", "confidence": "3;4;4", "rating_avg": 9.666666666666666, "confidence_avg": 3.6666666666666665, "replies_avg": 53, "authors#_avg": 5, "corr_rating_confidence": 1.0, "gs_citation": 5522, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4613672282544622621&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 28 }, { "id": "SyCSsUDee", "title": "Semantic Noise Modeling for Better Representation Learning", "track": "main", "status": "Reject", "tldr": "A novel latent space modeling method to learn better representation", "abstract": "Latent representation learned from multi-layered neural networks via hierarchical feature abstraction enables recent success of deep learning. Under the deep learning framework, generalization performance highly depends on the learned latent representation. In this work, we propose a novel latent space modeling method to learn better latent representation. We designed a neural network model based on the assumption that good base representation for supervised tasks can be attained by maximizing the sum of hierarchical mutual informations between the input, latent, and output variables. From this base model, we introduce a semantic noise modeling method which enables semantic perturbation on the latent space to enhance the representational power of learned latent feature. 
During training, latent vector representation can be stochastically perturbed by a modeled additive noise while preserving its original semantics. It implicitly brings the effect of semantic augmentation on the latent space. The proposed model can be easily learned by back-propagation with common gradient-based optimization algorithms. Experimental results show that the proposed method helps to achieve performance benefits against various previous approaches. We also provide the empirical analyses for the proposed latent space modeling method including t-SNE visualization.", "keywords": "Deep learning;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Hyo-Eun Kim;Sangheum Hwang;Kyunghyun Cho", "authorids": "hekim@lunit.io;shwang@lunit.io;kyunghyun.cho@nyu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nkim2017semantic,\ntitle={Semantic Noise Modeling for Better Representation Learning},\nauthor={Hyo-Eun Kim and Sangheum Hwang and Kyunghyun Cho},\nyear={2017},\nurl={https://openreview.net/forum?id=SyCSsUDee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SyCSsUDee", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;4;4", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 15, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:xCKU4F3RnRwJ:scholar.google.com/&scioq=Semantic+Noise+Modeling+for+Better+Representation+Learning&hl=en&as_sdt=0,33", "gs_version_total": 3 }, { "id": "SyEiHNKxx", "title": "A Differentiable Physics Engine for Deep Learning in Robotics", "track": "main", "status": "Workshop", "tldr": "We wrote a framework to differentiate through physics and show that this makes training deep learned controllers for robotics remarkably fast and straightforward", "abstract": "One of the most important fields in robotics is the optimization of controllers. Currently, robots are often treated as a black box in this optimization process, which is the reason why derivative-free optimization methods such as evolutionary algorithms or reinforcement learning are omnipresent. When gradient-based methods are used, models are kept small or rely on finite difference approximations for the Jacobian. This method quickly grows expensive with increasing numbers of parameters, such as found in deep learning. We propose an implementation of a modern physics engine, which can differentiate control parameters. This engine is implemented for both CPU and GPU. Firstly, this paper shows how such an engine speeds up the optimization process, even for small problems. Furthermore, it explains why this is an alternative approach to deep Q-learning, for using deep learning in robotics. 
Finally, we argue that this is a big step for deep learning in robotics, as it opens up new possibilities to optimize robots, both in hardware and software.", "keywords": "Deep learning", "primary_area": "", "supplementary_material": "", "author": "Jonas Degrave;Michiel Hermans;Joni Dambre;Francis wyffels", "authorids": "Jonas.Degrave@UGent.be;x@UGent.be;Joni.Dambre@UGent.be;Francis.wyffels@UGent.be", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ndegrave2017a,\ntitle={A Differentiable Physics Engine for Deep Learning in Robotics},\nauthor={Jonas Degrave and Michiel Hermans and Joni Dambre and Francis wyffels},\nyear={2017},\nurl={https://openreview.net/forum?id=SyEiHNKxx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SyEiHNKxx", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;2;4", "rating_avg": 5.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 13, "authors#_avg": 4, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 274, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6535318790190529013&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 15 }, { "id": "SyJNmVqgg", "title": "Neural Data Filter for Bootstrapping Stochastic Gradient Descent", "track": "main", "status": "Workshop", "tldr": "We propose a reinforcement learning based teacher-student framework for filtering training data to boost SGD convergence.", "abstract": "Mini-batch based Stochastic Gradient Descent(SGD) has been widely used to train deep neural networks efficiently. In this paper, we design a general framework to automatically and adaptively select training data for SGD. The framework is based on neural networks and we call it \\emph{\\textbf{N}eural \\textbf{D}ata \\textbf{F}ilter} (\\textbf{NDF}). In Neural Data Filter, the whole training process of the original neural network is monitored and supervised by a deep reinforcement network, which controls whether to filter some data in sequentially arrived mini-batches so as to maximize future accumulative reward (e.g., validation accuracy). The SGD process accompanied with NDF is able to use less data and converge faster while achieving comparable accuracy as the standard SGD trained on the full dataset. 
Our experiments show that NDF bootstraps SGD training for different neural network models including Multi Layer Perceptron Network and Recurrent Neural Network trained on various types of tasks including image classification and text understanding.", "keywords": "Reinforcement Learning;Deep learning;Optimization", "primary_area": "", "supplementary_material": "", "author": "Yang Fan;Fei Tian;Tao Qin;Tie-Yan Liu", "authorids": "v-yanfa@microsoft.com;fetia@microsoft.com;taoqin@microsoft.com;tie-yan.liu@microsoft.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nfan2017neural,\ntitle={Neural Data Filter for Bootstrapping Stochastic Gradient Descent},\nauthor={Yang Fan and Fei Tian and Tao Qin and Tie-Yan Liu},\nyear={2017},\nurl={https://openreview.net/forum?id=SyJNmVqgg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SyJNmVqgg", "pdf_size": 0, "rating": "4;6;7", "confidence": "5;4;0", "rating_avg": 5.666666666666667, "confidence_avg": 3.0, "replies_avg": 20, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844387, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2420784274028731255&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "SyK00v5xx", "title": "A Simple but Tough-to-Beat Baseline for Sentence Embeddings", "track": "main", "status": "Poster", "tldr": "A simple unsupervised method for sentence embedding that can get results comparable to sophisticated models like RNN's and LSTM's", "abstract": "\nThe success of neural network methods for computing word embeddings has motivated methods for generating semantic embeddings of longer pieces of text, such as sentences and paragraphs. Surprisingly, Wieting et al (ICLR'16) showed that such complicated methods are outperformed, especially in out-of-domain (transfer learning) settings, by simpler methods involving mild retraining of word embeddings and basic linear regression. The method of Wieting et al. requires retraining with a substantial labeled dataset such as Paraphrase Database (Ganitkevitch et al., 2013). \n\nThe current paper goes further, showing that the following completely unsupervised sentence embedding is a formidable baseline: Use word embeddings computed using one of the popular methods on unlabeled corpus like Wikipedia, represent the sentence by a weighted average of the word vectors, and then modify them a bit using PCA/SVD. This weighting improves performance by about 10% to 30% in textual similarity tasks, and beats sophisticated supervised methods including RNN's and LSTM's. It even improves Wieting et al.'s embeddings. \n This simple method should be used as the baseline to beat in future, especially when labeled training data is scarce or nonexistent. \n\nThe paper also gives a theoretical explanation of the success of the above unsupervised method using a latent variable generative model for sentences, which is a simple extension of the model in Arora et al. (TACL'16) with new \"smoothing\" terms that allow for \nwords occurring out of context, as well as high probabilities for words like and, not in all contexts. 
", "keywords": "Natural language processing;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Sanjeev Arora;Yingyu Liang;Tengyu Ma", "authorids": "arora@cs.princeton.edu;yingyul@cs.princeton.edu;tengyu@cs.princeton.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\narora2017a,\ntitle={A Simple but Tough-to-Beat Baseline for Sentence Embeddings},\nauthor={Sanjeev Arora and Yingyu Liang and Tengyu Ma},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=SyK00v5xx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SyK00v5xx", "pdf_size": 0, "rating": "7;7;8", "confidence": "4;4;3", "rating_avg": 7.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 19, "authors#_avg": 3, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 1817, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11920872790962895964&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "id": "SyOvg6jxx", "title": "#Exploration: A Study of Count-Based Exploration for Deep Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "We improve exploration in deep reinforcement learning by simply hashing states and assigning bonus rewards according to state counts.", "abstract": "Count-based exploration algorithms are known to perform near-optimally when used in conjunction with tabular reinforcement learning (RL) methods for solving small discrete Markov decision processes (MDPs). It is generally thought that count-based methods cannot be applied in high-dimensional state spaces, since most states will only occur once.\nRecent deep RL exploration strategies are able to deal with high-dimensional continuous state spaces through complex heuristics, often relying on optimism in the face of uncertainty or intrinsic motivation. \n\nIn this work, we describe a surprising finding: a simple generalization of the classic count-based approach can reach near state-of-the-art performance on various high-dimensional and/or continuous deep RL benchmarks. States are mapped to hash codes, which allows to count their occurrences with a hash table. These counts are then used to compute a reward bonus according to the classic count-based exploration theory. We find that simple hash functions can achieve surprisingly good results on many challenging tasks. Furthermore, we show that a domain-dependent learned hash code may further improve these results.\n\nDetailed analysis reveals important aspects of a good hash function: 1) having appropriate granularity and 2) encoding information relevant to solving the MDP. 
This exploration strategy achieves near state-of-the-art performance on both continuous control tasks and Atari 2600 games, hence providing a simple yet powerful baseline for solving MDPs that require considerable exploration.", "keywords": "Deep learning;Reinforcement Learning;Games", "primary_area": "", "supplementary_material": "", "author": "Haoran Tang;Rein Houthooft;Davis Foote;Adam Stooke;Xi Chen;Yan Duan;John Schulman;Filip De Turck;Pieter Abbeel", "authorids": "hrtang.alex@berkeley.edu;rein.houthooft@ugent.be;djfoote@berkeley.edu;adam.stooke@berkeley.edu;peter@openai.com;rocky@openai.com;joschu@openai.com;filip.deturck@ugent.be;pieter@openai.com", "gender": ";;;;;;;;", "homepage": ";;;;;;;;", "dblp": ";;;;;;;;", "google_scholar": ";;;;;;;;", "orcid": ";;;;;;;;", "linkedin": ";;;;;;;;", "or_profile": ";;;;;;;;", "aff": ";;;;;;;;", "aff_domain": ";;;;;;;;", "position": ";;;;;;;;", "bibtex": "@misc{\ntang2017exploration,\ntitle={\\#Exploration: A Study of Count-Based Exploration for Deep Reinforcement Learning},\nauthor={Haoran Tang and Rein Houthooft and Davis Foote and Adam Stooke and Xi Chen and Yan Duan and John Schulman and Filip De Turck and Pieter Abbeel},\nyear={2017},\nurl={https://openreview.net/forum?id=SyOvg6jxx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SyOvg6jxx", "pdf_size": 0, "rating": "4;6;7", "confidence": "3;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 15, "authors#_avg": 9, "corr_rating_confidence": 0.9449111825230683, "gs_citation": 774, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7719009312505331345&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9 }, { "id": "SyQq185lg", "title": "Latent Sequence Decompositions", "track": "main", "status": "Poster", "tldr": "", "abstract": "Sequence-to-sequence models rely on a fixed decomposition of the target sequences into a sequence of tokens that may be words, word-pieces or characters. The choice of these tokens and the decomposition of the target sequences into a sequence of tokens is often static, and independent of the input, output data domains. This can potentially lead to a sub-optimal choice of token dictionaries, as the decomposition is not informed by the particular problem being solved. In this paper we present Latent Sequence Decompositions (LSD), a framework in which the decomposition of sequences into constituent tokens is learnt during the training of the model. The decomposition depends both on the input sequence and on the output sequence. In LSD, during training, the model samples decompositions incrementally, from left to right by locally sampling between valid extensions. We experiment with the Wall Street Journal speech recognition task. Our LSD model achieves 12.9% WER compared to a character baseline of 14.8% WER. 
When combined with a convolutional network on the encoder, we achieve a WER of 9.6%.\n", "keywords": "Speech;Applications;Natural language processing;Deep learning", "primary_area": "", "supplementary_material": "", "author": "William Chan;Yu Zhang;Quoc Le;Navdeep Jaitly", "authorids": "williamchan@cmu.edu;yzhang87@mit.edu;qvl@google.com;ndjaitly@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nchan2017latent,\ntitle={Latent Sequence Decompositions},\nauthor={William Chan and Yu Zhang and Quoc Le and Navdeep Jaitly},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=SyQq185lg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer2;AnonReviewer6", "site": "https://openreview.net/forum?id=SyQq185lg", "pdf_size": 0, "rating": "7;7;8", "confidence": "4;5;4", "rating_avg": 7.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 11, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 68, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10658053814036023464&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "SyVVJ85lg", "title": "Paleo: A Performance Model for Deep Neural Networks", "track": "main", "status": "Poster", "tldr": "Paleo: An analytical performance model for exploring the space of scalable deep learning systems and quickly diagnosing their effectiveness for a given problem instance.", "abstract": "Although various scalable deep learning software packages have been proposed, it remains unclear how to best leverage parallel and distributed computing infrastructure to accelerate their training and deployment. Moreover, the effectiveness of existing parallel and distributed systems varies widely based on the neural network architecture and dataset under consideration. In order to efficiently explore the space of scalable deep learning systems and quickly diagnose their effectiveness for a given problem instance, we introduce an analytical performance model called Paleo. Our key observation is that a neural network architecture carries with it a declarative specification of the computational requirements associated with its training and evaluation. By extracting these requirements from a given architecture and mapping them to a specific point within the design space of software, hardware and communication strategies, Paleo can efficiently and accurately model the expected scalability and performance of a putative deep learning system. We show that Paleo is robust to the choice of network architecture, hardware, software, communication schemes, and parallelization strategies. We further demonstrate its ability to accurately model various recently published scalability results for CNNs such as NiN, Inception and AlexNet.", "keywords": "Deep learning", "primary_area": "", "supplementary_material": "", "author": "Hang Qi;Evan R. Sparks;Ameet Talwalkar", "authorids": "hangqi@cs.ucla.edu;sparks@cs.berkeley.edu;ameet@cs.ucla.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nqi2017paleo,\ntitle={Paleo: A Performance Model for Deep Neural Networks},\nauthor={Hang Qi and Evan R. 
Sparks and Ameet Talwalkar},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=SyVVJ85lg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SyVVJ85lg", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;4", "rating_avg": 6.333333333333333, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 254, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10999423042570044777&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "SyW2QSige", "title": "Towards Information-Seeking Agents", "track": "main", "status": "Reject", "tldr": "We investigate the behavior of models trained to answer questions by asking sequences of simple questions.", "abstract": "We develop a general problem setting for training and testing the ability of agents to gather information efficiently. Specifically, we present a collection of tasks in which success requires searching through a partially-observed environment, for fragments of information which can be pieced together to accomplish various goals. We combine deep architectures with techniques from reinforcement learning to develop agents that solve our tasks. We shape the behavior of these agents by combining extrinsic and intrinsic rewards. We empirically demonstrate that these agents learn to search actively and intelligently for new information to reduce their uncertainty, and to exploit information they have already acquired.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Philip Bachman;Alessandro Sordoni;Adam Trischler", "authorids": "phil.bachman@maluuba.com;alessandro.sordoni@maluuba.com;adam.trischler@maluuba.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nbachman2017towards,\ntitle={Towards Information-Seeking Agents},\nauthor={Philip Bachman and Alessandro Sordoni and Adam Trischler},\nyear={2017},\nurl={https://openreview.net/forum?id=SyW2QSige}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SyW2QSige", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;4;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=849368256242320069&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "SyWvgP5el", "title": "EPOpt: Learning Robust Neural Network Policies Using Model Ensembles", "track": "main", "status": "Poster", "tldr": "An ensemble optimization approach to help transfer neural network policies from simulated domains to real-world target domains.", "abstract": "Sample complexity and safety are major challenges when learning policies with reinforcement learning for real-world tasks, especially when the policies are represented using rich function approximators like deep neural networks. Model-based methods where the real-world target domain is approximated using a simulated source domain provide an avenue to tackle the above challenges by augmenting real data with simulated data. However, discrepancies between the simulated source domain and the target domain pose a challenge for simulated training. 
We introduce the EPOpt algorithm, which uses an ensemble of simulated source domains and a form of adversarial training to learn policies that are robust and generalize to a broad range of possible target domains, including to unmodeled effects. Further, the probability distribution over source domains in the ensemble can be adapted using data from the target domain and approximate Bayesian methods, to progressively make it a better approximation. Thus, learning on a model ensemble, along with source domain adaptation, provides the benefit of both robustness and learning.", "keywords": "Reinforcement Learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Aravind Rajeswaran;Sarvjeet Ghotra;Balaraman Ravindran;Sergey Levine", "authorids": "aravraj@cs.washington.edu;sarvjeet.13it236@nitk.edu.in;ravi@cse.iitm.ac.in;svlevine@eecs.berkeley.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nrajeswaran2017epopt,\ntitle={{EPO}pt: Learning Robust Neural Network Policies Using Model Ensembles},\nauthor={Aravind Rajeswaran and Sarvjeet Ghotra and Balaraman Ravindran and Sergey Levine},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=SyWvgP5el}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SyWvgP5el", "pdf_size": 0, "rating": "7;7;8", "confidence": "4;0;4", "rating_avg": 7.333333333333333, "confidence_avg": 2.6666666666666665, "replies_avg": 9, "authors#_avg": 4, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 439, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14747920778535129414&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "SyZprb5xg", "title": "On Robust Concepts and Small Neural Nets", "track": "main", "status": "Workshop", "tldr": "an efficient analog of the universal approximation theorem for neural networks over the boolean hypercube", "abstract": "The universal approximation theorem for neural networks says that any reasonable function is well-approximated by a two-layer neural network with sigmoid gates but it does not provide good bounds on the number of hidden-layer nodes or the weights. However, robust concepts often have small neural networks in practice. We show an efficient analog of the universal approximation theorem on the boolean hypercube in this context.\n\nWe prove that any noise-stable boolean function on n boolean-valued input variables can be well-approximated by a two-layer linear threshold circuit with a small number of hidden-layer nodes and small weights, that depend only on the noise-stability and approximation parameters, and are independent of n. We also give a polynomial time learning algorithm that outputs a small two-layer linear threshold circuit that approximates such a given function. 
We also show weaker generalizations of this to noise-stable polynomial threshold functions and noise-stable boolean functions in general.", "keywords": "Theory", "primary_area": "", "supplementary_material": "", "author": "Amit Deshpande;Sushrut Karmalkar", "authorids": "amitdesh@microsoft.com;sushrutk@cs.utexas.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ndeshpande2017on,\ntitle={On Robust Concepts and Small Neural Nets},\nauthor={Amit Deshpande and Sushrut Karmalkar},\nyear={2017},\nurl={https://openreview.net/forum?id=SyZprb5xg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=SyZprb5xg", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;2", "rating_avg": 5.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 13, "authors#_avg": 2, "corr_rating_confidence": -1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:hZiYio0qwYgJ:scholar.google.com/&scioq=On+Robust+Concepts+and+Small+Neural+Nets&hl=en&as_sdt=0,33", "gs_version_total": 5 }, { "id": "Syfkm6cgx", "title": "Improving Invariance and Equivariance Properties of Convolutional Neural Networks", "track": "main", "status": "Reject", "tldr": "Data augmentation shapes internal network representation and makes predictions robust to input transformations.", "abstract": "Convolutional Neural Networks (CNNs) learn highly discriminative representations from data, but how robust and structured are these representations? How does the data shape the internal network representation? We shed light on these questions by empirically measuring the invariance and equivariance properties of a large number of CNNs trained with various types of input transformations. We find that CNNs learn invariance wrt all 9 tested transformation types and that invariance extends to transformations outside the training range. We also measure the distance between CNN representations and show that similar input transformations lead to more similar internal representations. Transforms can be grouped by the way they affect the learned representation. 
Additionally, we also propose a loss function that aims to improve CNN equivariance.", "keywords": "Deep learning", "primary_area": "", "supplementary_material": "", "author": "Christopher Tensmeyer;Tony Martinez", "authorids": "tensmeyer@byu.edu;martinez@cs.byu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ntensmeyer2017improving,\ntitle={Improving Invariance and Equivariance Properties of Convolutional Neural Networks},\nauthor={Christopher Tensmeyer and Tony Martinez},\nyear={2017},\nurl={https://openreview.net/forum?id=Syfkm6cgx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=Syfkm6cgx", "pdf_size": 0, "rating": "4;4;5", "confidence": "5;4;3", "rating_avg": 4.333333333333333, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 2, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14927926602116915115&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "SygGlIBcel", "title": "Opening the vocabulary of neural language models with character-level word representations", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper introduces an architecture for an open-vocabulary neural language model. Word representations are computed on-the-fly by a convolution network followed by pooling layer. This allows the model to consider any word, in the context or for the prediction. The training objective is derived from the Noise-Contrastive Estimation to circumvent the lack of vocabulary. We test the ability of our model to build representations of unknown words on the MT task of IWSLT-2016 from English to Czech, in a reranking setting. Experimental results show promising results, with a gain up to 0.7 BLEU point. 
They also emphasize the difficulty and instability when training such models with character-based representations for the predicted words.", "keywords": "Natural language processing;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Matthieu Labeau;Alexandre Allauzen", "authorids": "labeau@limsi.fr;allauzen@limsi.fr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nlabeau2017opening,\ntitle={Opening the vocabulary of neural language models with character-level word representations},\nauthor={Matthieu Labeau and Alexandre Allauzen},\nyear={2017},\nurl={https://openreview.net/forum?id=SygGlIBcel}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SygGlIBcel", "pdf_size": 0, "rating": "2;3;4", "confidence": "5;4;4", "rating_avg": 3.0, "confidence_avg": 4.333333333333333, "replies_avg": 20, "authors#_avg": 2, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9764472349975873633&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "SygvTcYee", "title": "ParMAC: distributed optimisation of nested functions, with application to binary autoencoders", "track": "main", "status": "Reject", "tldr": "", "abstract": "Many powerful machine learning models are based on the composition of multiple processing layers, such as deep nets, which gives rise to nonconvex objective functions. A general, recent approach to optimise such \"nested\" functions is the \"method of auxiliary coordinates (MAC)\". MAC introduces an auxiliary coordinate for each data point in order to decouple the nested model into independent submodels. This decomposes the optimisation into steps that alternate between training single layers and updating the coordinates. It has the advantage that it reuses existing single-layer algorithms, introduces parallelism, and does not need to use chain-rule gradients, so it works with nondifferentiable layers. We describe ParMAC, a distributed-computation model for MAC. This trains on a dataset distributed across machines while limiting the amount of communication so it does not obliterate the benefit of parallelism. ParMAC works on a cluster of machines with a circular topology and alternates two steps until convergence: one step trains the submodels in parallel using stochastic updates, and the other trains the coordinates in parallel. Only submodel parameters, no data or coordinates, are ever communicated between machines. ParMAC exhibits high parallelism, low communication overhead, and facilitates data shuffling, load balancing, fault tolerance and streaming data processing. We study the convergence of ParMAC and its parallel speedup, and implement ParMAC using MPI to learn binary autoencoders for fast image retrieval, achieving nearly perfect speedups in a 128-processor cluster with a training set of 100 million high-dimensional points.\n", "keywords": "Optimization;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Miguel A. 
Carreira-Perpinan;Mehdi Alizadeh", "authorids": "mcarreira-perpinan@ucmerced.edu;malizadeh@ucmerced.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ncarreira-perpinan2017parmac,\ntitle={Par{MAC}: distributed optimisation of nested functions, with application to binary autoencoders},\nauthor={Miguel A. Carreira-Perpinan and Mehdi Alizadeh},\nyear={2017},\nurl={https://openreview.net/forum?id=SygvTcYee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SygvTcYee", "pdf_size": 0, "rating": "4;5;6;6", "confidence": "2;4;4;4", "rating_avg": 5.25, "confidence_avg": 3.5, "replies_avg": 13, "authors#_avg": 2, "corr_rating_confidence": 0.8703882797784891, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Ei92wXyARrYJ:scholar.google.com/&scioq=ParMAC:+distributed+optimisation+of+nested+functions,+with+application+to+binary+autoencoders&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "Syoiqwcxx", "title": "Local minima in training of deep networks", "track": "main", "status": "Reject", "tldr": "As a contribution to the discussion about error surface and the question why \"deep and cheap\" learning works so well we present concrete examples of local minima and obstacles arising in the training of deep models.", "abstract": "There has been a lot of recent interest in trying to characterize the error surface of deep models. This stems from a long standing question. Given that deep networks are highly nonlinear systems optimized by local gradient methods, why do they not seem to be affected by bad local minima? It is widely believed that training of deep models using gradient methods works so well because the error surface either has no local minima, or if they exist they need to be close in value to the global minimum. It is known that such results hold under strong assumptions which are not satisfied by real models. In this paper we present examples showing that for such theorem to be true additional assumptions on the data, initialization schemes and/or the model classes have to be made. We look at the particular case of finite size datasets. 
We demonstrate that in this scenario one can construct counter-examples (datasets or initialization schemes) when the network does become susceptible to bad local minima over the weight space.", "keywords": "Theory;Deep learning;Supervised Learning;Optimization", "primary_area": "", "supplementary_material": "", "author": "Grzegorz Swirszcz;Wojciech Marian Czarnecki;Razvan Pascanu", "authorids": "swirszcz@google.com;lejlot@google.com;razp@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nswirszcz2017local,\ntitle={Local minima in training of deep networks},\nauthor={Grzegorz Swirszcz and Wojciech Marian Czarnecki and Razvan Pascanu},\nyear={2017},\nurl={https://openreview.net/forum?id=Syoiqwcxx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=Syoiqwcxx", "pdf_size": 0, "rating": "3;5;5", "confidence": "5;3;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.0, "replies_avg": 14, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17507493781170374595&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "SypU81Ole", "title": "Sampling Generative Networks", "track": "main", "status": "Reject", "tldr": "Demonstrates improved techniques for interpolation and deriving + evaluating attribute vectors in latent spaces applicable to both VAE and GAN models.", "abstract": "We introduce several techniques for sampling and visualizing the latent spaces of generative models. Replacing linear interpolation with spherical linear interpolation prevents diverging from a model's prior distribution and produces sharper samples. J-Diagrams and MINE grids are introduced as visualizations of manifolds created by analogies and nearest neighbors. We demonstrate two new techniques for deriving attribute vectors: bias-corrected vectors with data replication and synthetic vectors with data augmentation. Binary classification using attribute vectors is presented as a technique supporting quantitative analysis of the latent space. 
Most techniques are intended to be independent of model type and examples are shown on both Variational Autoencoders and Generative Adversarial Networks.\n", "keywords": "Unsupervised Learning;Deep learning;Computer vision", "primary_area": "", "supplementary_material": "", "author": "Tom White", "authorids": "tom.white@vuw.ac.nz", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nwhite2017sampling,\ntitle={Sampling Generative Networks},\nauthor={Tom White},\nyear={2017},\nurl={https://openreview.net/forum?id=SypU81Ole}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=SypU81Ole", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;4;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 19, "authors#_avg": 1, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 322, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=956039579339273684&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "Sys6GJqxl", "title": "Delving into Transferable Adversarial Examples and Black-box Attacks", "track": "main", "status": "Poster", "tldr": "", "abstract": "An intriguing property of deep neural networks is the existence of adversarial examples, which can transfer among different architectures. These transferable adversarial examples may severely hinder deep neural network-based applications. Previous works mostly study the transferability using small scale datasets. In this work, we are the first to conduct an extensive study of the transferability over large models and a large scale dataset, and we are also the first to study the transferability of targeted adversarial examples with their target labels. We study both non-targeted and targeted adversarial examples, and show that while transferable non-targeted adversarial examples are easy to find, targeted adversarial examples generated using existing approaches almost never transfer with their target labels. Therefore, we propose novel ensemble-based approaches to generating transferable adversarial examples. Using such approaches, we observe a large proportion of targeted adversarial examples that are able to transfer with their target labels for the first time. We also present some geometric studies to help understanding the transferable adversarial examples. 
Finally, we show that the adversarial examples generated using ensemble-based approaches can successfully attack Clarifai.com, which is a black-box image classification system.", "keywords": "Computer vision;Deep learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Yanpei Liu;Xinyun Chen;Chang Liu;Dawn Song", "authorids": "resodo.liu@gmail.com;jungyhuk@gmail.com;liuchang@eecs.berkeley.edu;dawnsong@cs.berkeley.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nliu2017delving,\ntitle={Delving into Transferable Adversarial Examples and Black-box Attacks},\nauthor={Yanpei Liu and Xinyun Chen and Chang Liu and Dawn Song},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=Sys6GJqxl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=Sys6GJqxl", "pdf_size": 0, "rating": "5;6;7", "confidence": "3;3;3", "rating_avg": 6.0, "confidence_avg": 3.0, "replies_avg": 22, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 2183, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11918479105697515542&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "SywUHFcge", "title": "A Theoretical Framework for Robustness of (Deep) Classifiers against Adversarial Samples", "track": "main", "status": "Workshop", "tldr": "We propose a theoretical framework to explain and measure model robustness and harden DNN model against adversarial attacks.", "abstract": "Most machine learning classifiers, including deep neural networks, are vulnerable to adversarial examples. Such inputs are typically generated by adding small but purposeful modifications that lead to incorrect outputs while imperceptible to human eyes. The goal of this paper is not to introduce a single method, but to make theoretical steps toward fully understanding adversarial examples. By using concepts from topology, our theoretical analysis brings forth the key reasons why an adversarial example can fool a classifier ($f_1$) and adds its oracle ($f_2$, like human eyes) in such analysis. \nBy investigating the topological relationship between two (pseudo)metric spaces corresponding to predictor $f_1$ and oracle $f_2$, we develop necessary and sufficient conditions that can determine if $f_1$ is always robust (strong-robust) against adversarial examples according to $f_2$. 
Interestingly our theorems indicate that just one unnecessary feature can make $f_1$ not strong-robust, and the right feature representation learning is the key to getting a classifier that is both accurate and strong robust.\n", "keywords": "Deep learning", "primary_area": "", "supplementary_material": "", "author": "Beilun Wang;Ji Gao;Yanjun Qi", "authorids": "bw4mw@virginia.edu;jg6yd@virginia.edu;yanjun@virginia.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nwang2017a,\ntitle={A Theoretical Framework for Robustness of (Deep) Classifiers against Adversarial Samples},\nauthor={Beilun Wang and Ji Gao and Yanjun Qi},\nyear={2017},\nurl={https://openreview.net/forum?id=SywUHFcge}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SywUHFcge", "pdf_size": 0, "rating": "3;5;5", "confidence": "4;4;2", "rating_avg": 4.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 39, "authors#_avg": 3, "corr_rating_confidence": -0.5, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16756638882132842413&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "Sywh5KYex", "title": "Learning Identity Mappings with Residual Gates", "track": "main", "status": "Reject", "tldr": "This paper proposes adding simple gates to layers to make learning identity mappings trivial. It also introduces Gated Plain Networks and Gated Residual Networks.", "abstract": "We propose a layer augmentation technique that adds shortcut connections with a linear gating mechanism, and can be applied to almost any network model. By using a scalar parameter to control each gate, we provide a way to learn identity mappings by optimizing only one parameter. We build upon the motivation behind Highway Neural Networks and Residual Networks, where a layer is reformulated in order to make learning identity mappings less problematic to the optimizer. The augmentation introduces only one extra parameter per layer, and provides easier optimization by making degeneration into identity mappings simpler. Experimental results show that augmenting layers provides better optimization, increased performance, and more layer independence. We evaluate our method on MNIST using fully-connected networks, showing empirical indications that our augmentation facilitates the optimization of deep models, and that it provides high tolerance to full layer removal: the model retains over 90% of its performance even after half of its layers have been randomly removed. In our experiments, augmented plain networks -- which can be interpreted as simplified Highway Neural Networks -- outperform ResNets, raising new questions on how shortcut connections should be designed. We also evaluate our model on CIFAR-10 and CIFAR-100 using augmented Wide ResNets, achieving 3.65% and 18.27% test error, respectively.", "keywords": "Computer vision;Deep learning;Optimization", "primary_area": "", "supplementary_material": "", "author": "Pedro H. P. Savarese;Leonardo O. Mazza;Daniel R. 
Figueiredo", "authorids": "savarese@land.ufrj.br;leonardomazza@poli.ufrj.br;daniel@land.ufrj.br", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nsavarese2017learning,\ntitle={Learning Identity Mappings with Residual Gates},\nauthor={Pedro H. P. Savarese and Leonardo O. Mazza and Daniel R. Figueiredo},\nyear={2017},\nurl={https://openreview.net/forum?id=Sywh5KYex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=Sywh5KYex", "pdf_size": 0, "rating": "5;5;6", "confidence": "5;5;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.666666666666667, "replies_avg": 20, "authors#_avg": 3, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16077670256516565465&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "SyxeqhP9ll", "title": "Calibrating Energy-based Generative Adversarial Networks", "track": "main", "status": "Poster", "tldr": "", "abstract": "In this paper, we propose to equip Generative Adversarial Networks with the ability to produce direct energy estimates for samples.\nSpecifically, we propose a flexible adversarial training framework, and prove this framework not only ensures the generator converges to the true data distribution, but also enables the discriminator to retain the density information at the global optimal.\nWe derive the analytic form of the induced solution, and analyze the properties.\nIn order to make the proposed framework trainable in practice, we introduce two effective approximation techniques.\nEmpirically, the experiment results closely match our theoretical analysis, verifying the discriminator is able to recover the energy of data distribution.", "keywords": "Deep learning", "primary_area": "", "supplementary_material": "", "author": "Zihang Dai;Amjad Almahairi;Philip Bachman;Eduard Hovy;Aaron Courville", "authorids": "zander.dai@gmail.com;amjadmahayri@gmail.com;phil.bachman@gmail.com;hovy@cmu.edu;aaron.courville@gmail.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\ndai2017calibrating,\ntitle={Calibrating Energy-based Generative Adversarial Networks},\nauthor={Zihang Dai and Amjad Almahairi and Philip Bachman and Eduard Hovy and Aaron Courville},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=SyxeqhP9ll}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SyxeqhP9ll", "pdf_size": 0, "rating": "7;8;8", "confidence": "5;4;4", "rating_avg": 7.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 13, "authors#_avg": 5, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 118, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8043491942343572906&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "r10FA8Kxg", "title": "Do Deep Convolutional Nets Really Need to be Deep and Convolutional?", "track": "main", "status": "Poster", "tldr": "This paper provides the first empirical demonstration that deep convolutional models really need to be both deep and convolutional, even 
when trained with model distillation and heavy hyperparameter optimization.", "abstract": "Yes, they do. This paper provides the first empirical demonstration that deep convolutional models really need to be both deep and convolutional, even when trained with methods such as distillation that allow small or shallow models of high accuracy to be trained. Although previous research showed that shallow feed-forward nets sometimes can learn the complex functions previously learned by deep nets while using the same number of parameters as the deep models they mimic, in this paper we demonstrate that the same methods cannot be used to train accurate models on CIFAR-10 unless the student models contain multiple layers of convolution. Although the student models do not have to be as deep as the teacher model they mimic, the students need multiple convolutional layers to learn functions of comparable accuracy as the deep convolutional teacher. ", "keywords": "Deep learning;Transfer Learning", "primary_area": "", "supplementary_material": "", "author": "Gregor Urban;Krzysztof J. Geras;Samira Ebrahimi Kahou;Ozlem Aslan;Shengjie Wang;Abdelrahman Mohamed;Matthai Philipose;Matt Richardson;Rich Caruana", "authorids": "gurban@uci.edu;k.j.geras@sms.ed.ac.uk;samira.ebrahimi-kahou@polymtl.ca;ozlem@cs.ualberta.ca;wangsj@cs.washington.edu;asamir@microsoft.com;matthaip@microsoft.com;mattri@microsoft.com;rcaruana@microsoft.com", "gender": ";;;;;;;;", "homepage": ";;;;;;;;", "dblp": ";;;;;;;;", "google_scholar": ";;;;;;;;", "orcid": ";;;;;;;;", "linkedin": ";;;;;;;;", "or_profile": ";;;;;;;;", "aff": ";;;;;;;;", "aff_domain": ";;;;;;;;", "position": ";;;;;;;;", "bibtex": "@inproceedings{\nurban2017do,\ntitle={Do Deep Convolutional Nets Really Need to be Deep and Convolutional?},\nauthor={Gregor Urban and Krzysztof J. Geras and Samira Ebrahimi Kahou and Ozlem Aslan and Shengjie Wang and Abdelrahman Mohamed and Matthai Philipose and Matt Richardson and Rich Caruana},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=r10FA8Kxg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=r10FA8Kxg", "pdf_size": 0, "rating": "7;7", "confidence": "3;4", "rating_avg": 7.0, "confidence_avg": 3.5, "replies_avg": 10, "authors#_avg": 9, "corr_rating_confidence": 0.0, "gs_citation": 299, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11234690386091662148&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "r17RD2oxe", "title": "Deep Neural Networks and the Tree of Life", "track": "main", "status": "Reject", "tldr": "Provideing a potential solution to the important problem of constructing a biology evolutionary tree; Giving insight into the representations produced by deep neural networks", "abstract": "In Evolutionary Biology, species close in the tree of evolution are identified by similar visual features. In computer vision, deep neural networks perform image classification by learning to identify similar visual features. This leads to an interesting question: is it possible to leverage the advantage of deep networks to construct a tree of life? In this paper, we make the first attempt at building the phylogenetic tree diagram by leveraging the high-level features learned by deep neural networks. Our method is based on the intuition that if two species share similar features, then their cross activations in the softmax layer should be high. 
Based on the deep representation of convolutional neural networks trained for image classification, we build a tree of life for species in the image categories of ImageNet. Further, for species not in the ImageNet categories that are visually similar to some category, the cosine similarity of their activation vectors in the same layer should be high. By applying the inner product similarity of the activation vectors at the last fully connected layer for different species, we can roughly build their tree of life. Our work provides a new perspective to the deep representation and sheds light on possible novel applications of deep representation to other areas like Bioinformatics.\n", "keywords": "Deep learning;Computer vision;Applications", "primary_area": "", "supplementary_material": "", "author": "Yan Wang;Kun He;John E. Hopcroft;Yu Sun", "authorids": "yanwang@hust.edu.cn;brooklet60@hust.edu.cn;jeh@cs.cornell.edu;ys646@cornell.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nwang2017deep,\ntitle={Deep Neural Networks and the Tree of Life},\nauthor={Yan Wang and Kun He and John E. Hopcroft and Yu Sun},\nyear={2017},\nurl={https://openreview.net/forum?id=r17RD2oxe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=r17RD2oxe", "pdf_size": 0, "rating": "3;4;4", "confidence": "5;4;4", "rating_avg": 3.6666666666666665, "confidence_avg": 4.333333333333333, "replies_avg": 9, "authors#_avg": 4, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1065315861111885552&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "r1Aab85gg", "title": "Offline bilingual word vectors, orthogonal transformations and the inverted softmax", "track": "main", "status": "Poster", "tldr": "We show that a linear transformation between word vector spaces should be orthogonal and can be obtained analytically using the SVD, and introduce the inverted softmax for information retrieval.", "abstract": "Usually bilingual word vectors are trained \"online''. Mikolov et al. showed they can also be found \"offline\"; whereby two pre-trained embeddings are aligned with a linear transformation, using dictionaries compiled from expert knowledge. In this work, we prove that the linear transformation between two spaces should be orthogonal. This transformation can be obtained using the singular value decomposition. We introduce a novel \"inverted softmax\" for identifying translation pairs, with which we improve the precision @1 of Mikolov's original mapping from 34% to 43%, when translating a test set composed of both common and rare English words into Italian. Orthogonal transformations are more robust to noise, enabling us to learn the transformation without expert bilingual signal by constructing a \"pseudo-dictionary\" from the identical character strings which appear in both languages, achieving 40% precision on the same test set. Finally, we extend our method to retrieve the true translations of English sentences from a corpus of 200k Italian sentences with a precision @1 of 68%.", "keywords": "Natural language processing;Transfer Learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Samuel L. Smith;David H. P. Turban;Steven Hamblin;Nils Y. 
Hammerla", "authorids": "samuel.smith@babylonhealth.com;dt382@cam.ac.uk;steven.hamblin@babylonhealth.com;nils.hammerla@babylonhealth.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nsmith2017offline,\ntitle={Offline bilingual word vectors, orthogonal transformations and the inverted softmax},\nauthor={Samuel L. Smith and David H. P. Turban and Steven Hamblin and Nils Y. Hammerla},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=r1Aab85gg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=r1Aab85gg", "pdf_size": 0, "rating": "6;7;8", "confidence": "3;5;5", "rating_avg": 7.0, "confidence_avg": 4.333333333333333, "replies_avg": 14, "authors#_avg": 4, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 642, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5636639417293949985&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "r1BJLw9ex", "title": "Adjusting for Dropout Variance in Batch Normalization and Weight Initialization", "track": "main", "status": "Reject", "tldr": "Batch Norm Incorrectly Estimates Variance When Dropout Is On", "abstract": "We show how to adjust for the variance introduced by dropout with corrections to weight initialization and Batch Normalization, yielding higher accuracy. Though dropout can preserve the expected input to a neuron between train and test, the variance of the input differs. We thus propose a new weight initialization by correcting for the influence of dropout rates and an arbitrary nonlinearity's influence on variance through simple corrective scalars. Since Batch Normalization trained with dropout estimates the variance of a layer's incoming distribution with some inputs dropped, the variance also differs between train and test. 
After training a network with Batch Normalization and dropout, we simply update Batch Normalization's variance moving averages with dropout off and obtain state of the art on CIFAR-10 and CIFAR-100 without data augmentation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dan Hendrycks;Kevin Gimpel", "authorids": "dan@ttic.edu;kgimpel@ttic.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nhendrycks2017adjusting,\ntitle={Adjusting for Dropout Variance in Batch Normalization and Weight Initialization},\nauthor={Dan Hendrycks and Kevin Gimpel},\nyear={2017},\nurl={https://openreview.net/forum?id=r1BJLw9ex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=r1BJLw9ex", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;4;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 15, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8016220873925807526&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "r1Bjj8qge", "title": "Encoding and Decoding Representations with Sum- and Max-Product Networks", "track": "main", "status": "Reject", "tldr": "Sum-Product Networks can be effectively employed for unsupervised representation learning, when turned into Max-Product Networks, they can also be used as encoder-decoders", "abstract": "Sum-Product networks (SPNs) are expressive deep architectures for representing probability distributions, yet allowing exact and efficient inference. SPNs have been successfully applied in several domains, however always as black-box distribution estimators. In this paper, we argue that due to their recursive definition, SPNs can also be naturally employed as hierarchical feature extractors and thus for unsupervised representation learning. Moreover, when converted into Max-Product Networks (MPNs), it is possible to decode such representations back into the original input space. In this way, MPNs can be interpreted as a kind of generative autoencoder, even if they were never trained to reconstruct the input data. We show how these learned representations, if visualized, indeed correspond to \"meaningful parts\" of the training data. They also yield a large improvement when used in structured prediction tasks. 
As shown in extensive experiments, SPN and MPN encoding and decoding schemes prove very competitive against the ones employing RBMs and other stacked autoencoder architectures.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Antonio Vergari;Robert Peharz;Nicola Di Mauro;Floriana Esposito", "authorids": "antonio.vergari@uniba.it;robert.peharz@medunigraz.at;nicola.dimauro@uniba.it;floriana.esposito@uniba.it", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nvergari2017encoding,\ntitle={Encoding and Decoding Representations with Sum- and Max-Product Networks},\nauthor={Antonio Vergari and Robert Peharz and Nicola Di Mauro and Floriana Esposito},\nyear={2017},\nurl={https://openreview.net/forum?id=r1Bjj8qge}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=r1Bjj8qge", "pdf_size": 0, "rating": "3;6;6", "confidence": "3;4;4", "rating_avg": 5.0, "confidence_avg": 3.6666666666666665, "replies_avg": 19, "authors#_avg": 4, "corr_rating_confidence": 1.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1682260695328606924&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "r1Chut9xl", "title": "Inference and Introspection in Deep Generative Models of Sparse Data", "track": "main", "status": "Reject", "tldr": "We study two techniques to improve learning in deep generative models on sparse, high-dimensional text data. We also propose an algorithmic tool to visualize and introspect arbitrarily deep learned models.\u00a0", "abstract": "Deep generative models such as deep latent Gaussian models (DLGMs) are powerful and popular density estimators. However, they have been applied almost exclusively to dense data such as images; DLGMs are rarely applied to sparse, high-dimensional integer data such as word counts or product ratings. One reason is that the standard training procedures find poor local optima when applied to such data. We propose two techniques that alleviate this problem, significantly improving our ability to fit DLGMs to sparse, high-dimensional data. Having fit these models, we are faced with another challenge: how to use and interpret the representation that we have learned? To that end, we propose a method that extracts distributed representations of features via a simple linearization of the model. ", "keywords": "Unsupervised Learning;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Rahul G. Krishnan;Matthew Hoffman", "authorids": "rahul@cs.nyu.edu;matthoffm@adobe.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nkrishnan2017inference,\ntitle={Inference and Introspection in Deep Generative Models of Sparse Data},\nauthor={Rahul G. 
Krishnan and Matthew Hoffman},\nyear={2017},\nurl={https://openreview.net/forum?id=r1Chut9xl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4;AnonReviewer5;AnonReviewer3", "site": "https://openreview.net/forum?id=r1Chut9xl", "pdf_size": 0, "rating": "5;5;6;7", "confidence": "4;3;4;3", "rating_avg": 5.75, "confidence_avg": 3.5, "replies_avg": 14, "authors#_avg": 2, "corr_rating_confidence": -0.30151134457776363, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14838199466791114736&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "r1G4z8cge", "title": "Mollifying Networks", "track": "main", "status": "Poster", "tldr": "We are proposing a new continuation method for neural networks, that starts from optimizing a convex objective function and gradually during the training the function evolves into more non-convex function.", "abstract": "The optimization of deep neural networks can be more challenging than the traditional convex optimization problems due to highly non-convex nature of the loss function, e.g. it can involve pathological landscapes such as saddle-surfaces that can be difficult to escape from for algorithms based on simple gradient descent. In this paper, we attack the problem of optimization of highly non-convex neural networks objectives by starting with a smoothed -- or mollified -- objective function which becomes more complex as the training proceeds. Our proposition is inspired by the recent studies in continuation methods: similarly to curriculum methods, we begin by learning an easier (possibly convex) objective function and let it evolve during training until it eventually becomes the original, difficult to optimize objective function. The complexity of the mollified networks is controlled by a single hyperparameter that is annealed during training. 
We show improvements on various difficult optimization tasks and establish a relationship between recent works on continuation methods for neural networks and mollifiers.\n", "keywords": "Deep learning;Optimization", "primary_area": "", "supplementary_material": "", "author": "Caglar Gulcehre;Marcin Moczulski;Francesco Visin;Yoshua Bengio", "authorids": "gulcehrc@iro.umontreal.ca;marcin-m@post.pl;fvisin@gmail.com;yoshua.umontreal@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\ngulcehre2017mollifying,\ntitle={Mollifying Networks},\nauthor={Caglar Gulcehre and Marcin Moczulski and Francesco Visin and Yoshua Bengio},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=r1G4z8cge}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=r1G4z8cge", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;5", "rating_avg": 6.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 13, "authors#_avg": 4, "corr_rating_confidence": 0.9999999999999998, "gs_citation": 65, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6680511322029006078&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "r1GKzP5xx", "title": "Recurrent Normalization Propagation", "track": "main", "status": "Workshop", "tldr": "Extension of Normalization Propagation to the LSTM.", "abstract": "We propose a LSTM parametrization that preserves the means and variances of the hidden states and memory cells across time. While having training benefits similar to Recurrent Batch Normalization and Layer Normalization, it does not need to estimate statistics at each time step, therefore, requiring fewer computations overall. We also investigate the parametrization impact on the gradient flows and present a way of initializing the weights accordingly.\n\nWe evaluate our proposal on language modelling and image generative modelling tasks. 
We empirically show that it performs similarly or better than other recurrent normalization approaches, while being faster to execute.", "keywords": "Deep learning;Optimization", "primary_area": "", "supplementary_material": "", "author": "C\u00e9sar Laurent;Nicolas Ballas;Pascal Vincent", "authorids": "cesar.laurent@umontreal.ca;nicolas.ballas@umontreal.ca;pascal.vincent@umontreal.ca", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nlaurent2017recurrent,\ntitle={Recurrent Normalization Propagation},\nauthor={C{\\'e}sar Laurent and Nicolas Ballas and Pascal Vincent},\nyear={2017},\nurl={https://openreview.net/forum?id=r1GKzP5xx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=r1GKzP5xx", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;4;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 13, "authors#_avg": 3, "corr_rating_confidence": -0.49999999999999983, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7975972866647795459&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "r1IRctqxg", "title": "Sample Importance in Training Deep Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "The contribution of each sample during model training varies across training iterations and the model's parameters. We define the concept of sample importance as the change in parameters induced by a sample. In this paper, we explored the sample importance in training deep neural networks using stochastic gradient descent. We found that \"easy\" samples -- samples that are correctly and confidently classified at the end of the training -- shape parameters closer to the output, while the \"hard\" samples impact parameters closer to the input to the network. Further, \"easy\" samples are relevant in the early training stages, and \"hard\" in the late training stage. Further, we show that constructing batches which contain samples of comparable difficulties tends to be a poor strategy compared to maintaining a mix of both hard and easy samples in all of the batches. 
Interestingly, this contradicts some of the results on curriculum learning which suggest that ordering training examples in terms of difficulty can lead to better performance.", "keywords": "Deep learning;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Tianxiang Gao;Vladimir Jojic", "authorids": "tgao@cs.unc.edu;vjojic@cs.unc.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ngao2017sample,\ntitle={Sample Importance in Training Deep Neural Networks},\nauthor={Tianxiang Gao and Vladimir Jojic},\nyear={2017},\nurl={https://openreview.net/forum?id=r1IRctqxg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=r1IRctqxg", "pdf_size": 0, "rating": "2;3;7", "confidence": "4;4;4", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 18, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6706572133907336052&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "r1LXit5ee", "title": "Episodic Exploration for Deep Deterministic Policies for StarCraft Micromanagement", "track": "main", "status": "Poster", "tldr": "We propose a new reinforcement learning algorithm based on zero order optimization, that we evaluate on StarCraft micromanagement scenarios.", "abstract": "We consider scenarios from the real-time strategy game StarCraft as benchmarks for reinforcement learning algorithms. We focus on micromanagement, that is, the short-term, low-level control of team members during a battle. We propose several scenarios that are challenging for reinforcement learning algorithms because the state- action space is very large, and there is no obvious feature representation for the value functions. We describe our approach to tackle the micromanagement scenarios with deep neural network controllers from raw state features given by the game engine. We also present a heuristic reinforcement learning algorithm which combines direct exploration in the policy space and backpropagation. This algorithm collects traces for learning using deterministic policies, which appears much more efficient than, e.g., \u03b5-greedy exploration. 
Experiments show that this algorithm allows to successfully learn non-trivial strategies for scenarios with armies of up to 15 agents, where both Q-learning and REINFORCE struggle.", "keywords": "Deep learning;Reinforcement Learning;Games", "primary_area": "", "supplementary_material": "", "author": "Nicolas Usunier;Gabriel Synnaeve;Zeming Lin;Soumith Chintala", "authorids": "usunier@fb.com;gab@fb.com;zlin@fb.com;soumith@fb.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nusunier2017episodic,\ntitle={Episodic Exploration for Deep Deterministic Policies for StarCraft Micromanagement},\nauthor={Nicolas Usunier and Gabriel Synnaeve and Zeming Lin and Soumith Chintala},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=r1LXit5ee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=r1LXit5ee", "pdf_size": 0, "rating": "7;7;8", "confidence": "4;4;4", "rating_avg": 7.333333333333333, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7736750993336226828&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "r1PRvK9el", "title": "Implicit ReasoNet: Modeling Large-Scale Structured Relationships with Shared Memory", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent studies on knowledge base completion, the task of recovering missing relationships based on recorded relations, demonstrate the importance of learning embeddings from multi-step relations. However, due to the size of knowledge bases, learning multi-step relations directly on top of observed instances could be costly. In this paper, we propose Implicit ReasoNets (IRNs), which is designed to perform large-scale inference implicitly through a search controller and shared memory. Unlike previous work, IRNs use training data to learn to perform multi-step inference through the shared memory, which is also jointly updated during training. 
While the inference procedure is not operating on top of observed instances for IRNs, our proposed model outperforms all previous approaches on the popular FB15k benchmark by more than 5.7%.", "keywords": "Deep learning;Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Yelong Shen*;Po-Sen Huang*;Ming-Wei Chang;Jianfeng Gao", "authorids": "yeshen@microsoft.com;pshuang@microsoft.com;minchang@microsoft.com;jfgao@microsoft.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nshen*2017implicit,\ntitle={Implicit ReasoNet: Modeling Large-Scale Structured Relationships with Shared Memory},\nauthor={Yelong Shen* and Po-Sen Huang* and Ming-Wei Chang and Jianfeng Gao},\nyear={2017},\nurl={https://openreview.net/forum?id=r1PRvK9el}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=r1PRvK9el", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;4;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 15, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9051793235224236499&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "r1R5Z19le", "title": "Semi-supervised deep learning by metric embedding", "track": "main", "status": "Workshop", "tldr": "", "abstract": "Deep networks are successfully used as classification models yielding state-of-the-art results when trained on a large number of labeled samples. These models, however, are usually much less suited for semi-supervised problems because of their tendency to overfit easily when trained on small amounts of data. In this work we will explore a new training objective that is targeting a semi-supervised regime with only a small subset of labeled data. This criterion is based on a deep metric embedding over distance relations within the set of labeled samples, together with constraints over the embeddings of the unlabeled set. 
The final learned representations are discriminative in euclidean space, and hence can be used with subsequent nearest-neighbor classification using the labeled samples.", "keywords": "Deep learning;Semi-Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Elad Hoffer;Nir Ailon", "authorids": "ehoffer@tx.technion.ac.il;nailon@cs.technion.ac.il", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nhoffer2017semisupervised,\ntitle={Semi-supervised deep learning by metric embedding},\nauthor={Elad Hoffer and Nir Ailon},\nyear={2017},\nurl={https://openreview.net/forum?id=r1R5Z19le}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=r1R5Z19le", "pdf_size": 0, "rating": "4;6", "confidence": "4;4", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 44, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11128578925116265303&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "r1S083cgx", "title": "Sequence generation with a physiologically plausible model of handwriting and Recurrent Mixture Density Networks", "track": "main", "status": "Reject", "tldr": "To explore the feasibility and potential benefits of using a physiologically plausible model of handwriting as a feature representation for sequence generation with recurrent mixture density networks", "abstract": "The purpose of this study is to explore the feasibility and potential benefits of using a physiologically plausible model of handwriting as a feature representation for sequence generation with recurrent mixture density networks. We build on recent results in handwriting prediction developed by Graves (2013), and we focus on generating sequences that possess the statistical and dynamic qualities of handwriting and calligraphic art forms. Rather than model raw sequence data, we first preprocess and reconstruct the input training data with a concise representation given by a motor plan (in the form of a coarse sequence of `ballistic' targets) and corresponding dynamic parameters (which define the velocity and curvature of the pen-tip trajectory). 
This representation provides a number of advantages, such as enabling the system to learn from very few examples by introducing artificial variability in the training data, and mixing of visual and dynamic qualities learned from different datasets.", "keywords": "Deep learning;Supervised Learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Daniel Berio;Memo Akten;Frederic Fol Leymarie;Mick Grierson;R\u00e9jean Plamondon", "authorids": "d.berio@gold.ac.uk;m.akten@ac.uk;ffl@gold.ac.uk;m.grierson@gold.ac.uk;rejean.plamondon@polymtl.ca", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nberio2017sequence,\ntitle={Sequence generation with a physiologically plausible model of handwriting and Recurrent Mixture Density Networks},\nauthor={Daniel Berio and Memo Akten and Frederic Fol Leymarie and Mick Grierson and R{\\'e}jean Plamondon},\nyear={2017},\nurl={https://openreview.net/forum?id=r1S083cgx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=r1S083cgx", "pdf_size": 0, "rating": "3;3;3", "confidence": "3;3;5", "rating_avg": 3.0, "confidence_avg": 3.6666666666666665, "replies_avg": 12, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17608443196873894675&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "r1Ue8Hcxg", "title": "Neural Architecture Search with Reinforcement Learning", "track": "main", "status": "Oral", "tldr": "", "abstract": "Neural networks are powerful and flexible models that work well for many difficult learning tasks in image, speech and natural language understanding. Despite their success, neural networks are still hard to design. In this paper, we use a recurrent network to generate the model descriptions of neural networks and train this RNN with reinforcement learning to maximize the expected accuracy of the generated architectures on a validation set. On the CIFAR-10 dataset, our method, starting from scratch, can design a novel network architecture that rivals the best human-invented architecture in terms of test set accuracy. Our CIFAR-10 model achieves a test error rate of 3.65, which is 0.09 percent better and 1.05x faster than the previous state-of-the-art model that used a similar architectural scheme. On the Penn Treebank dataset, our model can compose a novel recurrent cell that outperforms the widely-used LSTM cell, and other state-of-the-art baselines. Our cell achieves a test set perplexity of 62.4 on the Penn Treebank, which is 3.6 perplexity better than the previous state-of-the-art model. 
The cell can also be transferred to the character language modeling task on PTB and achieves a state-of-the-art perplexity of 1.214.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Barret Zoph;Quoc Le", "authorids": "barretzoph@google.com;qvl@google.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nzoph2017neural,\ntitle={Neural Architecture Search with Reinforcement Learning},\nauthor={Barret Zoph and Quoc Le},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=r1Ue8Hcxg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=r1Ue8Hcxg", "pdf_size": 0, "rating": "9;9;9", "confidence": "5;4;4", "rating_avg": 9.0, "confidence_avg": 4.333333333333333, "replies_avg": 28, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 7202, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4164896773666247762&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 17 }, { "id": "r1Usiwcex", "title": "Counterpoint by Convolution", "track": "main", "status": "Reject", "tldr": "NADE generative model of music, with new insights on sampling", "abstract": "Machine learning models of music typically break down the task of composition into a chronological process, composing a piece of music in a single pass from beginning to end. On the contrary, human composers write music in a nonlinear fashion, scribbling motifs here and there, often revisiting choices previously made. We explore the use of blocked Gibbs sampling as an analogue to the human approach, and introduce Coconet, a convolutional neural network in the NADE family of generative models. Despite ostensibly sampling from the same distribution as the NADE ancestral sampling procedure, we find that a blocked Gibbs approach significantly improves sample quality. We provide evidence that this is due to some conditional distributions being poorly modeled. Moreover, we show that even the cheap approximate blocked Gibbs procedure from Yao et al. (2014) yields better samples than ancestral sampling. 
We demonstrate the versatility of our method on unconditioned polyphonic music generation.", "keywords": "Deep learning;Applications;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Cheng-Zhi Anna Huang;Tim Cooijmans;Adam Roberts;Aaron Courville;Douglas Eck", "authorids": "chengzhiannahuang@gmail.com;tim.cooijmans@umontreal.ca;adarob@google.com;aaron.courville@umontreal.ca;deck@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nhuang2017counterpoint,\ntitle={Counterpoint by Convolution},\nauthor={Cheng-Zhi Anna Huang and Tim Cooijmans and Adam Roberts and Aaron Courville and Douglas Eck},\nyear={2017},\nurl={https://openreview.net/forum?id=r1Usiwcex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer5;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=r1Usiwcex", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;5;3", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 215, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17136106034029671304&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "r1VGvBcxl", "title": "Reinforcement Learning through Asynchronous Advantage Actor-Critic on a GPU", "track": "main", "status": "Poster", "tldr": "Implementation and analysis of the computational aspect of a GPU version of the Asynchronous Advantage Actor-Critic (A3C) algorithm", "abstract": "We introduce a hybrid CPU/GPU version of the Asynchronous Advantage Actor-Critic (A3C) algorithm, currently the state-of-the-art method in reinforcement learning for various gaming tasks. We analyze its computational traits and concentrate on aspects critical to leveraging the GPU's computational power. We introduce a system of queues and a dynamic scheduling strategy, potentially helpful for other asynchronous algorithms as well. 
Our hybrid CPU/GPU version of A3C, based on TensorFlow, achieves a significant speed up compared to a CPU implementation; we make it publicly available to other researchers at https://github.com/NVlabs/GA3C.", "keywords": "Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Mohammad Babaeizadeh;Iuri Frosio;Stephen Tyree;Jason Clemons;Jan Kautz", "authorids": "mb2@uiuc.edu;ifrosio@nvidia.com;styree@nvidia.com;jclemons@nvidia.com;jkautz@nvidia.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nbabaeizadeh2017reinforcement,\ntitle={Reinforcement Learning through Asynchronous Advantage Actor-Critic on a {GPU}},\nauthor={Mohammad Babaeizadeh and Iuri Frosio and Stephen Tyree and Jason Clemons and Jan Kautz},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=r1VGvBcxl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=r1VGvBcxl", "pdf_size": 0, "rating": "5;7;8", "confidence": "5;5;3", "rating_avg": 6.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 16, "authors#_avg": 5, "corr_rating_confidence": -0.7559289460184545, "gs_citation": 385, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8757115672331028243&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "r1VdcHcxx", "title": "Recurrent Batch Normalization", "track": "main", "status": "Poster", "tldr": "Make batch normalization work in recurrent neural networks", "abstract": "We propose a reparameterization of LSTM that brings the benefits of batch normalization to recurrent neural networks. Whereas previous works only apply batch normalization to the input-to-hidden transformation of RNNs, we demonstrate that it is both possible and beneficial to batch-normalize the hidden-to-hidden transition, thereby reducing internal covariate shift between time steps.\n\nWe evaluate our proposal on various sequential problems such as sequence classification, language modeling and question answering. 
Our empirical results show that our batch-normalized LSTM consistently leads to faster convergence and improved generalization.", "keywords": "Deep learning;Optimization", "primary_area": "", "supplementary_material": "", "author": "Tim Cooijmans;Nicolas Ballas;C\u00e9sar Laurent;\u00c7a\u011flar G\u00fcl\u00e7ehre;Aaron Courville", "authorids": "tim.cooijmans@umontreal.ca;nicolas.ballas@umontreal.ca;cesar.laurent@umontreal.ca;caglar.gulcehre@umontreal.ca;aaron.courville@umontreal.ca", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\ncooijmans2017recurrent,\ntitle={Recurrent Batch Normalization},\nauthor={Tim Cooijmans and Nicolas Ballas and C{\\'e}sar Laurent and {\\c{C}}a{\\u{g}}lar G{\\\"u}l{\\c{c}}ehre and Aaron Courville},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=r1VdcHcxx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=r1VdcHcxx", "pdf_size": 0, "rating": "7;7;8", "confidence": "4;4;4", "rating_avg": 7.333333333333333, "confidence_avg": 4.0, "replies_avg": 16, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 536, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16437445141311981298&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "r1WUqIceg", "title": "Improving Stochastic Gradient Descent with Feedback", "track": "main", "status": "Reject", "tldr": "We improve stochastic gradient descent by incorporating feedback from the objective function", "abstract": "In this paper we propose a simple and efficient method for improving stochastic gradient descent methods by using feedback from the objective function. The method tracks the relative changes in the objective function with a running average, and uses it to adaptively tune the learning rate in stochastic gradient descent. We specifically apply this idea to modify Adam, a popular algorithm for training deep neural networks. We conduct experiments to compare the resulting algorithm, which we call Eve, with state of the art methods used for training deep learning models. We train CNNs for image classification, and RNNs for language modeling and question answering. Our experiments show that Eve outperforms all other algorithms on these benchmark tasks. 
We also analyze the behavior of the feedback mechanism during the training process.\n", "keywords": "Deep learning;Optimization", "primary_area": "", "supplementary_material": "", "author": "Jayanth Koushik;Hiroaki Hayashi", "authorids": "jkoushik@cs.cmu.edu;hiroakih@cs.cmu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nkoushik2017improving,\ntitle={Improving Stochastic Gradient Descent with Feedback},\nauthor={Jayanth Koushik and Hiroaki Hayashi},\nyear={2017},\nurl={https://openreview.net/forum?id=r1WUqIceg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=r1WUqIceg", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 14, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11734354123072206252&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "r1X3g2_xl", "title": "Adversarial Training Methods for Semi-Supervised Text Classification", "track": "main", "status": "Poster", "tldr": "", "abstract": "Adversarial training provides a means of regularizing supervised learning algorithms while virtual adversarial training is able to extend supervised learning algorithms to the semi-supervised setting.\nHowever, both methods require making small perturbations to numerous entries of the input vector, which is inappropriate for sparse high-dimensional inputs such as one-hot word representations.\nWe extend adversarial and virtual adversarial training to the text domain by applying perturbations to the word embeddings in a recurrent neural network rather than to the original input itself.\nThe proposed method achieves state of the art results on multiple benchmark semi-supervised and purely supervised tasks.\nWe provide visualizations and analysis showing that the learned word embeddings have improved in quality and that while training, the model is less prone to overfitting.\n", "keywords": "Natural language processing;Deep learning;Semi-Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Takeru Miyato;Andrew M. Dai;Ian Goodfellow", "authorids": "takeru.miyato@gmail.com;adai@google.com;ian@openai.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nmiyato2017adversarial,\ntitle={Adversarial Training Methods for Semi-Supervised Text Classification},\nauthor={Takeru Miyato and Andrew M. 
Dai and Ian Goodfellow},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=r1X3g2_xl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=r1X3g2_xl", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;5;3", "rating_avg": 6.666666666666667, "confidence_avg": 4.0, "replies_avg": 11, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 1425, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6594257289645930121&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 16 }, { "id": "r1YNw6sxg", "title": "Learning Visual Servoing with Deep Features and Fitted Q-Iteration", "track": "main", "status": "Poster", "tldr": "We use deep semantic features, learned predictive dynamics, and reinforcement learning to efficiently learn a visual servoing policy that is robust to visual variations.", "abstract": "Visual servoing involves choosing actions that move a robot in response to observations from a camera, in order to reach a goal configuration in the world. Standard visual servoing approaches typically rely on manually designed features and analytical dynamics models, which limits their generalization capability and often requires extensive application-specific feature and model engineering. In this work, we study how learned visual features, learned predictive dynamics models, and reinforcement learning can be combined to learn visual servoing mechanisms. We focus on target following, with the goal of designing algorithms that can learn a visual servo using low amounts of data of the target in question, to enable quick adaptation to new targets. Our approach is based on servoing the camera in the space of learned visual features, rather than image pixels or manually-designed keypoints. We demonstrate that standard deep features, in our case taken from a model trained for object classification, can be used together with a bilinear predictive model to learn an effective visual servo that is robust to visual variation, changes in viewing angle and appearance, and occlusions. A key component of our approach is to use a sample-efficient fitted Q-iteration algorithm to learn which features are best suited for the task at hand. We show that we can learn an effective visual servo on a complex synthetic car following benchmark using just 20 training trajectory samples for reinforcement learning. We demonstrate substantial improvement over a conventional approach based on image pixels or hand-designed keypoints, and we show an improvement in sample-efficiency of more than two orders of magnitude over standard model-free deep reinforcement learning algorithms. Videos are available at http://rll.berkeley.edu/visual_servoing.", "keywords": "Computer vision;Deep learning;Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Alex X. Lee;Sergey Levine;Pieter Abbeel", "authorids": "alexlee_gk@cs.berkeley.edu;svlevine@cs.berkeley.edu;pabbeel@cs.berkeley.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nlee2017learning,\ntitle={Learning Visual Servoing with Deep Features and Fitted Q-Iteration},\nauthor={Alex X. 
Lee and Sergey Levine and Pieter Abbeel},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=r1YNw6sxg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=r1YNw6sxg", "pdf_size": 0, "rating": "7;7;8", "confidence": "3;4;3", "rating_avg": 7.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 94, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3206092082961124818&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "r1aGWUqgg", "title": "Unsupervised Learning of State Representations for Multiple Tasks", "track": "main", "status": "Reject", "tldr": "Learning method for automatic detection of multiple reinforcement tasks and extraction of state representations from raw observations", "abstract": "We present an approach for learning state representations in multi-task reinforcement learning. Our method learns multiple low-dimensional state representations from raw observations in an unsupervised fashion, without any knowledge of which task is executed, nor of the number of tasks involved.\nThe method is based on a gated neural network architecture, trained with an extension of the learning with robotic priors objective. In simulated experiments, we show that our method is able to learn better state representations for reinforcement learning, and we analyze why and when it manages to do so.", "keywords": "Reinforcement Learning;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Antonin Raffin;Sebastian H\u00f6fer;Rico Jonschkowski;Oliver Brock;Freek Stulp", "authorids": "antonin.raffin@ensta-paristech.fr;sebastian.hoefer@tu-berlin.de;rico.jonschkowski@tu-berlin.de;oliver.brock@tu-berlin.de;freek.stulp@dlr.de", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nraffin2017unsupervised,\ntitle={Unsupervised Learning of State Representations for Multiple Tasks},\nauthor={Antonin Raffin and Sebastian H{\\\"o}fer and Rico Jonschkowski and Oliver Brock and Freek Stulp},\nyear={2017},\nurl={https://openreview.net/forum?id=r1aGWUqgg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=r1aGWUqgg", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;4;3", "rating_avg": 5.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 10, "authors#_avg": 5, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6842606284916380507&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "r1aPbsFle", "title": "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling", "track": "main", "status": "Poster", "tldr": "", "abstract": "Recurrent neural networks have been very successful at predicting sequences of words in tasks such as language modeling. However, all such models are based on the conventional classification framework, where the model is trained against one-hot targets, and each word is represented both as an input and as an output in isolation. 
This causes inefficiencies in learning both in terms of utilizing all of the information and in terms of the number of parameters needed to train. We introduce a novel theoretical framework that facilitates better learning in language modeling, and show that our framework leads to tying together the input embedding and the output projection matrices, greatly reducing the number of trainable variables. Our framework leads to state of the art performance on the Penn Treebank with a variety of network models.", "keywords": "Natural language processing;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Hakan Inan;Khashayar Khosravi;Richard Socher", "authorids": "inanh@stanford.edu;khosravi@stanford.edu;rsocher@salesforce.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ninan2017tying,\ntitle={Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling},\nauthor={Hakan Inan and Khashayar Khosravi and Richard Socher},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=r1aPbsFle}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=r1aPbsFle", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;4;4", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 16, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 448, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14036439381566283404&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "r1br_2Kge", "title": "Short and Deep: Sketching and Neural Networks", "track": "main", "status": "Workshop", "tldr": "For sparse boolean inputs, Neural Networks operating on very short sketches can provably and empirically represent a large class of functions.", "abstract": "Data-independent methods for dimensionality reduction such as random projections, sketches, and feature hashing have become increasingly popular in recent years. These methods often seek to reduce dimensionality while preserving the hypothesis class, resulting in inherent lower bounds on the size of projected data. For example, preserving linear separability requires $\\Omega(1/\\gamma^2)$ dimensions, where $\\gamma$ is the margin, and in the case of polynomial functions, the number of required dimensions has an exponential dependence on the polynomial degree.\n \nDespite these limitations, we show that the dimensionality can be reduced further while maintaining performance guarantees, using improper learning with a slightly larger hypothesis class. In particular, we show that any sparse polynomial function of a sparse binary vector can be computed from a compact sketch by a single-layer neural network, where the sketch size has a logarithmic dependence on the polynomial degree.\n \nA practical consequence is that networks trained on sketched data are compact, and therefore suitable for settings with memory and power constraints. 
We empirically show that our approach leads to networks with fewer parameters than related methods such as feature hashing, at equal or better performance.", "keywords": "Theory", "primary_area": "", "supplementary_material": "", "author": "Amit Daniely;Nevena Lazic;Yoram Singer;Kunal Talwar", "authorids": "amitdaniely@google.com;nevena@google.com;singer@google.com;kunal@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ndaniely2017short,\ntitle={Short and Deep: Sketching and Neural Networks},\nauthor={Amit Daniely and Nevena Lazic and Yoram Singer and Kunal Talwar},\nyear={2017},\nurl={https://openreview.net/forum?id=r1br_2Kge}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=r1br_2Kge", "pdf_size": 0, "rating": "4;5;5", "confidence": "2;4;2", "rating_avg": 4.666666666666667, "confidence_avg": 2.6666666666666665, "replies_avg": 16, "authors#_avg": 4, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9858238498975266247&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "r1fYuytex", "title": "Sparsely-Connected Neural Networks: Towards Efficient VLSI Implementation of Deep Neural Networks", "track": "main", "status": "Poster", "tldr": "We show that the number of connections in fully-connected networks can be reduced by up to 90% while improving the accuracy performance.", "abstract": "Recently deep neural networks have received considerable attention due to their ability to extract and represent high-level abstractions in data sets. Deep neural networks such as fully-connected and convolutional neural networks have shown excellent performance on a wide range of recognition and classification tasks. However, their hardware implementations currently suffer from large silicon area and high power consumption due to their high degree of complexity. The power/energy consumption of neural networks is dominated by memory accesses, the majority of which occur in fully-connected networks. In fact, they contain most of the deep neural network parameters. In this paper, we propose sparsely-connected networks, by showing that the number of connections in fully-connected networks can be reduced by up to 90% while improving the accuracy performance on three popular datasets (MNIST, CIFAR10 and SVHN). We then propose an efficient hardware architecture based on linear-feedback shift registers to reduce the memory requirements of the proposed sparsely-connected networks. The proposed architecture can save up to 90% of memory compared to the conventional implementations of fully-connected neural networks. Moreover, implementation results show up to 84% reduction in the energy consumption of a single neuron of the proposed sparsely-connected networks compared to a single neuron of fully-connected neural networks.", "keywords": "Deep learning;Applications;Optimization", "primary_area": "", "supplementary_material": "", "author": "Arash Ardakani;Carlo Condo;Warren J. 
Gross", "authorids": "arash.ardakani@mail.mcgill.ca;carlo.condo@mail.mcgill.ca;warren.gross@mcgill.ca", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nardakani2017sparselyconnected,\ntitle={Sparsely-Connected Neural Networks: Towards Efficient {VLSI} Implementation of Deep Neural Networks},\nauthor={Arash Ardakani and Carlo Condo and Warren J. Gross},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=r1fYuytex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=r1fYuytex", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;4;3", "rating_avg": 6.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 15, "authors#_avg": 3, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 128, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8908276848272854100&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "r1kGbydxg", "title": "Learning Locomotion Skills Using DeepRL: Does the Choice of Action Space Matter?", "track": "main", "status": "Reject", "tldr": "We compare the impact of four different action parameterizations (torques, muscle-activations, target joint angles, and target joint-angle velocities) in terms of learning time, policy robustness, motion quality, and policy query rates.", "abstract": "The use of deep reinforcement learning allows for high-dimensional state descriptors, but little is known about how the choice of action representation impacts the learning difficulty and the resulting performance. We compare the impact of four different action parameterizations (torques, muscle-activations, target joint angles, and target joint-angle velocities) in terms of learning time, policy robustness, motion quality, and policy query rates. Our results are evaluated on a gait cycle imitation task for multiple planar articulated figures and multiple gaits. 
We demonstrate that the local feedback provided by higher-level action parameterizations can significantly impact the learning, robustness, and quality of the resulting policies.", "keywords": "Reinforcement Learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Xue Bin Peng;Michiel van de Panne", "authorids": "xbpeng@cs.ubc.ca;van@cs.ubc.ca", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\npeng2017learning,\ntitle={Learning Locomotion Skills Using Deep{RL}: Does the Choice of Action Space Matter?},\nauthor={Xue Bin Peng and Michiel van de Panne},\nyear={2017},\nurl={https://openreview.net/forum?id=r1kGbydxg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=r1kGbydxg", "pdf_size": 0, "rating": "6;6;6", "confidence": "3;4;4", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 12, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 239, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8497884097534841322&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9 }, { "id": "r1kQkVFgl", "title": "Learning Python Code Suggestion with a Sparse Pointer Network", "track": "main", "status": "Reject", "tldr": "We augment a neural language model with a pointer network for code suggestion that is specialized to referring to predefined groups of identifiers", "abstract": "To enhance developer productivity, all modern integrated development environments (IDEs) include code suggestion functionality that proposes likely next tokens at the cursor. While current IDEs work well for statically-typed languages, their reliance on type annotations means that they do not provide the same level of support for dynamic programming languages as for statically-typed languages. Moreover, suggestion engines in modern IDEs do not propose expressions or multi-statement idiomatic code. Recent work has shown that language models can improve code suggestion systems by learning from software repositories. This paper introduces a neural language model with a sparse pointer network aimed at capturing very long range dependencies. We release a large-scale code suggestion corpus of 41M lines of Python code crawled from GitHub. On this corpus, we found standard neural language models to perform well at suggesting local phenomena, but struggle to refer to identifiers that are introduced many tokens in the past. By augmenting a neural language model with a pointer network specialized in referring to predefined classes of identifiers, we obtain a much lower perplexity and a 5 percentage points increase in accuracy for code suggestion compared to an LSTM baseline. In fact, this increase in code suggestion accuracy is due to a 13 times more accurate prediction of identifiers. 
Furthermore, a qualitative analysis shows this model indeed captures interesting long-range dependencies, like referring to a class member defined over 60 tokens in the past.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Avishkar Bhoopchand;Tim Rockt\u00e4schel;Earl Barr;Sebastian Riedel", "authorids": "avishkar.bhoopchand.15@ucl.ac.uk;t.rocktaschel@cs.ucl.ac.uk;e.barr@cs.ucl.ac.uk;s.riedel@cs.ucl.ac.uk", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nbhoopchand2017learning,\ntitle={Learning Python Code Suggestion with a Sparse Pointer Network},\nauthor={Avishkar Bhoopchand and Tim Rockt{\\\"a}schel and Earl Barr and Sebastian Riedel},\nyear={2017},\nurl={https://openreview.net/forum?id=r1kQkVFgl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=r1kQkVFgl", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 15, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 110, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10813503799302771023&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "r1nTpv9eg", "title": "Learning to Perform Physics Experiments via Deep Reinforcement Learning", "track": "main", "status": "Poster", "tldr": "We train agents to conduct experiments in interactive simulated physical environments.", "abstract": "When encountering novel objects, humans are able to infer a wide range of physical properties such as mass, friction and deformability by interacting with them in a goal driven way. This process of active interaction is in the same spirit as a scientist performing experiments to discover hidden facts. Recent advances in artificial intelligence have yielded machines that can achieve superhuman performance in Go, Atari, natural language processing, and complex control problems; however, it is not clear that these systems can rival the scientific intuition of even a young child. In this work we introduce a basic set of tasks that require agents to estimate properties such as mass and cohesion of objects in an interactive simulated environment where they can manipulate the objects and observe the consequences. We found that deep reinforcement learning methods can learn to perform the experiments necessary to discover such hidden properties. By systematically manipulating the problem difficulty and the cost incurred by the agent for performing experiments, we found that agents learn different strategies that balance the cost of gathering information against the cost of making mistakes in different situations. 
We also compare our learned experimentation policies to randomized baselines and show that the learned policies lead to better predictions.", "keywords": "Deep learning;Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Misha Denil;Pulkit Agrawal;Tejas D Kulkarni;Tom Erez;Peter Battaglia;Nando de Freitas", "authorids": "mdenil@google.com;pulkitag@berkeley.edu;tkulkarni@google.com;etom@google.com;peterbattaglia@google.com;nandodefreitas@google.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\ndenil2017learning,\ntitle={Learning to Perform Physics Experiments via Deep Reinforcement Learning},\nauthor={Misha Denil and Pulkit Agrawal and Tejas D Kulkarni and Tom Erez and Peter Battaglia and Nando de Freitas},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=r1nTpv9eg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer7;AnonReviewer3;AnonReviewer6;AnonReviewer5", "site": "https://openreview.net/forum?id=r1nTpv9eg", "pdf_size": 0, "rating": "6;7;7;7", "confidence": "3;4;3;3", "rating_avg": 6.75, "confidence_avg": 3.25, "replies_avg": 20, "authors#_avg": 6, "corr_rating_confidence": 0.3333333333333333, "gs_citation": 91, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13142558595749186250&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "r1osyr_xg", "title": "Fuzzy paraphrases in learning word representations with a lexicon", "track": "main", "status": "Reject", "tldr": "We propose a novel idea to address polysemy problem by annotating paraphrases with a degree of reliability like a member of a fuzzy set.", "abstract": "A synonym of a polysemous word is usually only the paraphrase of one sense among many. When lexicons are used to improve vector-space word representations, such paraphrases are unreliable and bring noise to the vector-space. The prior works use a coefficient to adjust the overall learning of the lexicons. They regard the paraphrases equally. \nIn this paper, we propose a novel approach that regards the paraphrases diversely to alleviate the adverse effects of polysemy. We annotate each paraphrase with a degree of reliability. The paraphrases are randomly eliminated according to the degrees when our model learns word representations. In this way, our approach drops the unreliable paraphrases, keeping more reliable paraphrases at the same time. The experimental results show that the proposed method improves the word vectors.\nOur approach is an attempt to address the polysemy problem keeping one vector per word. It makes the approach easier to use than the conventional methods that estimate multiple vectors for a word. 
Our approach also outperforms the prior works in the experiments.", "keywords": "Natural language processing;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Yuanzhi Ke;Masafumi Hagiwara", "authorids": "enshika8811.a6@keio.jp;hagiwara@keio.jp", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nke2017fuzzy,\ntitle={Fuzzy paraphrases in learning word representations with a lexicon},\nauthor={Yuanzhi Ke and Masafumi Hagiwara},\nyear={2017},\nurl={https://openreview.net/forum?id=r1osyr_xg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=r1osyr_xg", "pdf_size": 0, "rating": "3;5;6", "confidence": "4;3;4", "rating_avg": 4.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 16, "authors#_avg": 2, "corr_rating_confidence": -0.18898223650461363, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8064672380114578962&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "r1rhWnZkg", "title": "Hadamard Product for Low-rank Bilinear Pooling", "track": "main", "status": "Poster", "tldr": "A new state-of-the-art on the VQA (real image) dataset using an attention mechanism of low-rank bilinear pooling", "abstract": "Bilinear models provide rich representations compared with linear models. They have been applied in various visual tasks, such as object recognition, segmentation, and visual question-answering, to get state-of-the-art performances taking advantage of the expanded representations. However, bilinear representations tend to be high-dimensional, limiting the applicability to computationally complex tasks. We propose low-rank bilinear pooling using Hadamard product for an efficient attention mechanism of multimodal learning. 
We show that our model outperforms compact bilinear pooling in visual question-answering tasks with the state-of-the-art results on the VQA dataset, having a better parsimonious property.", "keywords": "Deep learning;Supervised Learning;Multi-modal learning", "primary_area": "", "supplementary_material": "", "author": "Jin-Hwa Kim;Kyoung-Woon On;Woosang Lim;Jeonghee Kim;Jung-Woo Ha;Byoung-Tak Zhang", "authorids": "jnhwkim@snu.ac.kr;kwon@bi.snu.ac.kr;quasar17@kaist.ac.kr;jeonghee.kim@navercorp.com;jungwoo.ha@navercorp.com;btzhang@bi.snu.ac.kr", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nkim2017hadamard,\ntitle={Hadamard Product for Low-rank Bilinear Pooling},\nauthor={Jin-Hwa Kim and Kyoung-Woon On and Woosang Lim and Jeonghee Kim and Jung-Woo Ha and Byoung-Tak Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=r1rhWnZkg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=r1rhWnZkg", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;5;3", "rating_avg": 6.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 25, "authors#_avg": 6, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 921, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11247536386590839195&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "r1rz6U5lg", "title": "Learning to superoptimize programs", "track": "main", "status": "Poster", "tldr": "", "abstract": " Code super-optimization is the task of transforming any given program to a more efficient version while preserving its input-output behaviour. In some sense, it is similar to the paraphrase problem from natural language processing where the intention is to change the syntax of an utterance without changing its semantics. Code-optimization has been the subject of years of research that has resulted in the development of rule-based transformation strategies that are used by compilers. More recently, however, a class of stochastic search based methods have been shown to outperform these strategies. This approach involves repeated sampling of modifications to the program from a proposal distribution, which are accepted or rejected based on whether they preserve correctness, and the improvement they achieve. These methods, however, neither learn from past behaviour nor do they try to leverage the semantics of the program under consideration. Motivated by this observation, we present a novel learning based approach for code super-optimization. Intuitively, our method works by learning the proposal distribution using unbiased estimators of the gradient of the expected improvement. Experiments on benchmarks comprising of automatically generated as well as existing (``Hacker's Delight'') programs show that the proposed method is able to significantly outperform state of the art approaches for code super-optimization.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Rudy Bunel;Alban Desmaison;M. Pawan Kumar;Philip H.S. 
Torr;Pushmeet Kohli", "authorids": "rudy@robots.ox.ac.uk;alban@robots.ox.ac.uk;pawan@robots.ox.ac.uk;philip.torr@eng.ox.ac.uk;pkohli@microsoft.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nbunel2017learning,\ntitle={Learning to superoptimize programs},\nauthor={Rudy Bunel and Alban Desmaison and M. Pawan Kumar and Philip H.S. Torr and Pushmeet Kohli},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=r1rz6U5lg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=r1rz6U5lg", "pdf_size": 0, "rating": "6;7;8", "confidence": "5;4;4", "rating_avg": 7.0, "confidence_avg": 4.333333333333333, "replies_avg": 17, "authors#_avg": 5, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 44, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10640586058639418377&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 8 }, { "id": "r1tHvHKge", "title": "Combating Deep Reinforcement Learning's Sisyphean Curse with Intrinsic Fear", "track": "main", "status": "Reject", "tldr": "Owing to function approximation, DRL agents eventually forget about dangerous transitions once they learn to avoid them, putting them at risk of perpetually repeating mistakes. We propose techniques to avert catastrophic outcomes.", "abstract": "To use deep reinforcement learning in the wild, we might hope for an agent that can avoid catastrophic mistakes. Unfortunately, even in simple environments, the popular deep Q-network (DQN) algorithm is doomed by a Sisyphean curse. Owing to the use of function approximation, these agents eventually forget experiences as they become exceedingly unlikely under a new policy. Consequently, for as long as they continue to train, DQNs may periodically relive catastrophic mistakes. Many real-world environments where people might be injured exhibit a special structure. We know a priori that catastrophes are not only bad, but that agents need not ever get near to a catastrophe state. In this paper, we exploit this structure to learn a reward-shaping that accelerates learning and guards oscillating policies against repeated catastrophes. First, we demonstrate unacceptable performance of DQNs on two toy problems. We then introduce intrinsic fear, a new method that mitigates these problems by avoiding dangerous states. Our approach incorporates a second model trained via supervised learning to predict the probability of catastrophe within a short number of steps. This score then acts to penalize the Q-learning objective, shaping the reward function away from catastrophic states.", "keywords": "Deep learning;Reinforcement Learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Zachary C. Lipton;Jianfeng Gao;Lihong Li;Jianshu Chen;Li Deng", "authorids": "zlipton@cs.ucsd.edu;jfgao@microsoft.com;lihongli.cs@gmail.com;jianshuc@microsoft.com;deng@microsoft.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nlipton2017combating,\ntitle={Combating Deep Reinforcement Learning's Sisyphean Curse with Intrinsic Fear},\nauthor={Zachary C. 
Lipton and Jianfeng Gao and Lihong Li and Jianshu Chen and Li Deng},\nyear={2017},\nurl={https://openreview.net/forum?id=r1tHvHKge}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=r1tHvHKge", "pdf_size": 0, "rating": "4;4;5", "confidence": "3;4;2", "rating_avg": 4.333333333333333, "confidence_avg": 3.0, "replies_avg": 14, "authors#_avg": 5, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13012810639595459585&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "r1te3Fqel", "title": "End-to-End Answer Chunk Extraction and Ranking for Reading Comprehension", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper proposes dynamic chunk reader (DCR), an end-to-end neural reading comprehension (RC) model that is able to extract and rank a set of answer candidates from a given document to answer questions. DCR is able to predict answers of variable lengths, whereas previous neural RC models primarily focused on predicting single tokens or entities. DCR encodes a document and an input question with recurrent neural networks, and then applies a word-by-word attention mechanism to acquire question-aware representations for the document, followed by the generation of chunk representations and a ranking module to propose the top-ranked chunk as the answer. Experimental results show that DCR could achieve a 66.3% Exact match and 74.7% F1 score on the Stanford Question Answering Dataset.", "keywords": "Natural language processing;Deep learning;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Yang Yu;Wei Zhang;Bowen Zhou;Kazi Hasan;Mo Yu;Bing Xiang", "authorids": "yu@us.ibm.com;zhangwei@us.ibm.com;zhou@us.ibm.com;kshasan@us.ibm.com;yum@us.ibm.com;bingxia@us.ibm.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nyu2017endtoend,\ntitle={End-to-End Answer Chunk Extraction and Ranking for Reading Comprehension},\nauthor={Yang Yu and Wei Zhang and Bowen Zhou and Kazi Hasan and Mo Yu and Bing Xiang},\nyear={2017},\nurl={https://openreview.net/forum?id=r1te3Fqel}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=r1te3Fqel", "pdf_size": 0, "rating": "4;5;6", "confidence": "3;3;3", "rating_avg": 5.0, "confidence_avg": 3.0, "replies_avg": 11, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 64, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13293531697892528036&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "r1w7Jdqxl", "title": "Collaborative Deep Embedding via Dual Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Despite the long history of research on recommender systems, current approaches still face a number of challenges in practice, e.g. the difficulties in handling new items, the high diversity of user interests, and the noisiness and sparsity of observations. Many of such difficulties stem from the lack of expressive power to capture the complex relations between items and users. This paper presents a new method to tackle this problem, called Collaborative Deep Embedding. 
In this method, a pair of dual networks, one for encoding items and the other for users, are jointly trained in a collaborative fashion. \nParticularly, both networks produce embeddings at multiple aligned levels, which, when combined together, can accurately predict the matching between items and users. Compared to existing methods, the proposed one not only provides greater expressive power to capture complex matching relations, but also generalizes better to unseen items or users. On multiple real-world datasets, this method outperforms the state of the art.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yilei Xiong;Dahua Lin;Haoying Niu;JIefeng Cheng;Zhenguo Li", "authorids": "xy014@ie.cuhk.edu.hk;dhlin@ie.cuhk.edu.hk;niu.haoying@huawei.com;cheng.jiefeng@huawei.com;li.zhenguo@huawei.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nxiong2017collaborative,\ntitle={Collaborative Deep Embedding via Dual Networks},\nauthor={Yilei Xiong and Dahua Lin and Haoying Niu and JIefeng Cheng and Zhenguo Li},\nyear={2017},\nurl={https://openreview.net/forum?id=r1w7Jdqxl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=r1w7Jdqxl", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;3", "rating_avg": 4.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 13, "authors#_avg": 5, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12361491004932517027&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "r1xUYDYgg", "title": "Development of JavaScript-based deep learning platform and application to distributed training", "track": "main", "status": "Workshop", "tldr": "Development of JavaScript-based matrix library and deep learning library which uses GPGPU. VGGNet is trained distributedly using web browsers.", "abstract": "Deep learning is increasingly attracting attention for processing big data.\nExisting frameworks for deep learning must be set up to specialized computer systems. Gaining sufficient computing resources therefore entails high costs of deployment and maintenance.\nIn this work, we implement a matrix library and deep learning framework that uses JavaScript. It can run on web browsers operating on ordinary personal computers and smartphones.\nUsing JavaScript, deep learning can be accomplished in widely diverse environments without the necessity for software installation. 
Using GPGPU from WebCL framework, our framework can train large scale convolutional neural networks such as VGGNet and ResNet.\nIn the experiments, we demonstrate their practicality by training VGGNet in a distributed manner using web browsers as the client.\n", "keywords": "Deep learning", "primary_area": "", "supplementary_material": "", "author": "Masatoshi Hidaka;Ken Miura;Tatsuya Harada", "authorids": "hidaka@mi.t.u-tokyo.ac.jp;miura@mi.t.u-tokyo.ac.jp;harada@mi.t.u-tokyo.ac.jp", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nhidaka2017development,\ntitle={Development of JavaScript-based deep learning platform and application to distributed training},\nauthor={Masatoshi Hidaka and Ken Miura and Tatsuya Harada},\nyear={2017},\nurl={https://openreview.net/forum?id=r1xUYDYgg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=r1xUYDYgg", "pdf_size": 0, "rating": "4;6;7", "confidence": "2;4;3", "rating_avg": 5.666666666666667, "confidence_avg": 3.0, "replies_avg": 13, "authors#_avg": 3, "corr_rating_confidence": 0.6546536707079772, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8333595116521890538&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4 }, { "id": "r1y1aawlg", "title": "Iterative Refinement for Machine Translation", "track": "main", "status": "Reject", "tldr": "We propose a novel decoding strategy for MT: after producing a full sentence the model can revisit its choice and substitute words; multiple words can iteratively be edited.", "abstract": "Existing machine translation decoding algorithms generate translations in a strictly monotonic fashion and never revisit previous decisions. As a result, earlier mistakes cannot be corrected at a later stage. In this paper, we present a translation scheme that starts from an initial guess and then makes iterative improvements that may revisit previous decisions. We parameterize our model as a convolutional neural network that predicts discrete substitutions to an existing translation based on an attention mechanism over both the source sentence as well as the current translation output. 
By making less than one modification per sentence, we improve the output of a phrase-based translation system by up to 0.4 BLEU on WMT15 German-English translation.", "keywords": "Natural language processing", "primary_area": "", "supplementary_material": "", "author": "Roman Novak;Michael Auli;David Grangier", "authorids": "roman.novak@polytechnique.edu;michaelauli@fb.com;grangier@fb.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nnovak2017iterative,\ntitle={Iterative Refinement for Machine Translation},\nauthor={Roman Novak and Michael Auli and David Grangier},\nyear={2017},\nurl={https://openreview.net/forum?id=r1y1aawlg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer5;AnonReviewer6;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=r1y1aawlg", "pdf_size": 0, "rating": "4;5;5;7", "confidence": "4;5;3;3", "rating_avg": 5.25, "confidence_avg": 3.75, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": -0.48420012470625223, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1840474479648957296&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9 }, { "id": "r1yjkAtxe", "title": "Spatio-Temporal Abstractions in Reinforcement Learning Through Neural Encoding", "track": "main", "status": "Reject", "tldr": "A method for understanding and improving deep agents by creating spatio-temporal abstractions", "abstract": "Recent progress in the field of Reinforcement Learning (RL) has enabled to tackle bigger and more challenging tasks. However, the increasing complexity of the problems, as well as the use of more sophisticated models such as Deep Neural Networks (DNN), impedes the understanding of artificial agents behavior. In this work, we present the Semi-Aggregated Markov Decision Process (SAMDP) model. The purpose of the SAMDP modeling is to describe and allow a better understanding of complex behaviors by identifying temporal and spatial abstractions. In contrast to other modeling approaches, SAMDP is built in a transformed state-space that encodes the dynamics of the problem. We show that working with the \\emph{right} state representation mitigates the problem of finding spatial and temporal abstractions. We describe the process of building the SAMDP model from observed trajectories and give examples for using it in a toy problem and complicated DQN policies. 
Finally, we show how using the SAMDP we can monitor the policy at hand and make it more robust.", "keywords": "Reinforcement Learning;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Nir Baram;Tom Zahavy;Shie Mannor", "authorids": "nirb@campus.technion.ac.il;tomzahavy@campus.technion.ac.il;shie@ee.technion.ac.il", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nbaram2017spatiotemporal,\ntitle={Spatio-Temporal Abstractions in Reinforcement Learning Through Neural Encoding},\nauthor={Nir Baram and Tom Zahavy and Shie Mannor},\nyear={2017},\nurl={https://openreview.net/forum?id=r1yjkAtxe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=r1yjkAtxe", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;5;5", "rating_avg": 4.0, "confidence_avg": 4.666666666666667, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=303206722460411116&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "rJ0-tY5xe", "title": "Learning to Query, Reason, and Answer Questions On Ambiguous Texts", "track": "main", "status": "Poster", "tldr": "A new dataset QRAQ of ambiguous stories in which an Agent must learn to reason and interact with a User to obtain important missing information needed to answer a challenge question.", "abstract": "A key goal of research in conversational systems is to train an interactive agent to help a user with a task. Human conversation, however, is notoriously incomplete, ambiguous, and full of extraneous detail. To operate effectively, the agent must not only understand what was explicitly conveyed but also be able to reason in the presence of missing or unclear information. When unable to resolve ambiguities on its own, the agent must be able to ask the user for the necessary clarifications and incorporate the response in its reasoning. Motivated by this problem we introduce QRAQ (\"crack\"; Query, Reason, and Answer Questions), a new synthetic domain, in which a User gives an Agent a short story and asks a challenge question. These problems are designed to test the reasoning and interaction capabilities of a learning-based Agent in a setting that requires multiple conversational turns. A good Agent should ask only non-deducible, relevant questions until it has enough information to correctly answer the User's question. We use standard and improved reinforcement learning based memory-network architectures to solve QRAQ problems in the difficult setting where the reward signal only tells the Agent if its final answer to the challenge question is correct or not. To provide an upper-bound to the RL results we also train the same architectures using supervised information that tells the Agent during training which variables to query and the answer to the challenge question. We evaluate our architectures on four QRAQ dataset types, and scale the complexity for each along multiple dimensions.", "keywords": "Natural language processing;Deep learning;Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Xiaoxiao Guo;Tim Klinger;Clemens Rosenbaum;Joseph P. 
Bigus;Murray Campbell;Ban Kawas;Kartik Talamadupula;Gerry Tesauro;Satinder Singh", "authorids": "tklinger@us.ibm.com;guoxiao@umich.edu;cgbr@cs.umass.edu;jbigus@us.ibm.com;mcam@us.ibm.com;bkawas@us.ibm.com;krtalamad@us.ibm.com;gtesauro@us.ibm.com;baveja@umich.edu", "gender": ";;;;;;;;", "homepage": ";;;;;;;;", "dblp": ";;;;;;;;", "google_scholar": ";;;;;;;;", "orcid": ";;;;;;;;", "linkedin": ";;;;;;;;", "or_profile": ";;;;;;;;", "aff": ";;;;;;;;", "aff_domain": ";;;;;;;;", "position": ";;;;;;;;", "bibtex": "@inproceedings{\nguo2017learning,\ntitle={Learning to Query, Reason, and Answer Questions On Ambiguous Texts},\nauthor={Xiaoxiao Guo and Tim Klinger and Clemens Rosenbaum and Joseph P. Bigus and Murray Campbell and Ban Kawas and Kartik Talamadupula and Gerry Tesauro and Satinder Singh},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=rJ0-tY5xe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rJ0-tY5xe", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;3;4", "rating_avg": 6.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 14, "authors#_avg": 9, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7051691436350761875&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "rJ0JwFcex", "title": "Neuro-Symbolic Program Synthesis", "track": "main", "status": "Poster", "tldr": "A neural architecture for learning programs in a domain-specific language that are consistent with a given set of input-output examples", "abstract": "Recent years have seen the proposal of a number of neural architectures for the problem of Program Induction. Given a set of input-output examples, these architectures are able to learn mappings that generalize to new test inputs. While achieving impressive results, these approaches have a number of important limitations: (a) they are computationally expensive and hard to train, (b) a model has to be trained for each task (program) separately, and (c) it is hard to interpret or verify the correctness of the learnt mapping (as it is defined by a neural network). In this paper, we propose a novel technique, Neuro-Symbolic Program Synthesis, to overcome the above-mentioned problems. Once trained, our approach can automatically construct computer programs in a domain-specific language that are consistent with a set of input-output examples provided at test time. Our method is based on two novel neural modules. The first module, called the cross correlation I/O network, given a set of input-output examples, produces a continuous representation of the set of I/O examples. The second module, the Recursive-Reverse-Recursive Neural Network (R3NN), given the continuous representation of the examples, synthesizes a program by incrementally expanding partial programs. We demonstrate the effectiveness of our approach by applying it to the rich and complex domain of regular expression based string transformations. 
Experiments show that the R3NN model is not only able to construct programs from new input-output examples, but it is also able to construct new programs for tasks that it had never observed before during training.", "keywords": "Deep learning;Structured prediction", "primary_area": "", "supplementary_material": "", "author": "Emilio Parisotto;Abdel-rahman Mohamed;Rishabh Singh;Lihong Li;Dengyong Zhou;Pushmeet Kohli", "authorids": "eparisot@andrew.cmu.edu;asamir@microsoft.com;risin@microsoft.com;lihongli@microsoft.com;denzho@microsoft.com;pkohli@microsoft.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nparisotto2017neurosymbolic,\ntitle={Neuro-Symbolic Program Synthesis},\nauthor={Emilio Parisotto and Abdel-rahman Mohamed and Rishabh Singh and Lihong Li and Dengyong Zhou and Pushmeet Kohli},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=rJ0JwFcex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rJ0JwFcex", "pdf_size": 0, "rating": "5;7;8", "confidence": "4;3;4", "rating_avg": 6.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 11, "authors#_avg": 6, "corr_rating_confidence": -0.18898223650461363, "gs_citation": 418, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1978557704933459244&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "rJ6DhP5xe", "title": "Generalizable Features From Unsupervised Learning", "track": "main", "status": "Workshop", "tldr": "Using generated data from a next frame predictor model to make a supervised model generalize better to unseen distributions. ", "abstract": "Humans learn a predictive model of the world and use this model to reason about future events and the consequences of actions. In contrast to most machine predictors, we exhibit an impressive ability to generalize to unseen scenarios and reason intelligently in these settings. One important aspect of this ability is physical intuition(Lake et al., 2016). In this work, we explore the potential of unsupervised learning to find features that promote better generalization to settings outside the supervised training distribution. Our task is predicting the stability of towers of square blocks. 
We demonstrate that an unsupervised model, trained to predict future frames of a video sequence of stable and unstable block configurations, can yield features that support extrapolating stability prediction to block configurations outside the training set distribution.", "keywords": "Unsupervised Learning;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Mehdi Mirza;Aaron Courville;Yoshua Bengio", "authorids": "memirzamo@gmail.com;aaron.courville@gmail.com;yoshua.umontreal@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nmirza2017generalizable,\ntitle={Generalizable Features From Unsupervised Learning},\nauthor={Mehdi Mirza and Aaron Courville and Yoshua Bengio},\nyear={2017},\nurl={https://openreview.net/forum?id=rJ6DhP5xe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rJ6DhP5xe", "pdf_size": 0, "rating": "3;5;5", "confidence": "4;4;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14930606096187073461&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "rJ8Je4clg", "title": "Learning to Play in a Day: Faster Deep Reinforcement Learning by Optimality Tightening", "track": "main", "status": "Poster", "tldr": "We propose a novel training algorithm for reinforcement learning which combines the strength of deep Q-learning with a constrained optimization approach to tighten optimality and encourage faster reward propagation.", "abstract": "We propose a novel training algorithm for reinforcement learning which combines the strength of deep Q-learning with a constrained optimization approach to tighten optimality and encourage faster reward propagation. Our novel technique makes deep reinforcement learning more practical by drastically reducing the training time. We evaluate the performance of our approach on the 49 games of the challenging Arcade Learning Environment, and report significant improvements in both training time and accuracy.", "keywords": "Reinforcement Learning;Optimization;Games", "primary_area": "", "supplementary_material": "", "author": "Frank S.He;Yang Liu;Alexander G. Schwing;Jian Peng", "authorids": "frankheshibi@gmail.com;liu301@illinois.edu;aschwing@illinois.edu;jianpeng@illinois.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\ns.he2017learning,\ntitle={Learning to Play in a Day: Faster Deep Reinforcement Learning by Optimality Tightening},\nauthor={Frank S.He and Yang Liu and Alexander G. 
Schwing and Jian Peng},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=rJ8Je4clg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=rJ8Je4clg", "pdf_size": 0, "rating": "4;9;9", "confidence": "4;4;3", "rating_avg": 7.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 28, "authors#_avg": 4, "corr_rating_confidence": -0.5000000000000001, "gs_citation": 98, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10390782710136276082&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4 }, { "id": "rJ8uNptgl", "title": "Towards the Limit of Network Quantization", "track": "main", "status": "Poster", "tldr": "", "abstract": "Network quantization is one of network compression techniques to reduce the redundancy of deep neural networks. It reduces the number of distinct network parameter values by quantization in order to save the storage for them. In this paper, we design network quantization schemes that minimize the performance loss due to quantization given a compression ratio constraint. We analyze the quantitative relation of quantization errors to the neural network loss function and identify that the Hessian-weighted distortion measure is locally the right objective function for the optimization of network quantization. As a result, Hessian-weighted k-means clustering is proposed for clustering network parameters to quantize. When optimal variable-length binary codes, e.g., Huffman codes, are employed for further compression, we derive that the network quantization problem can be related to the entropy-constrained scalar quantization (ECSQ) problem in information theory and consequently propose two solutions of ECSQ for network quantization, i.e., uniform quantization and an iterative solution similar to Lloyd's algorithm. 
Finally, using the simple uniform quantization followed by Huffman coding, we show from our experiments that the compression ratios of 51.25, 22.17 and 40.65 are achievable for LeNet, 32-layer ResNet and AlexNet, respectively.", "keywords": "Theory;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Yoojin Choi;Mostafa El-Khamy;Jungwon Lee", "authorids": "yoojin.c@samsung.com;mostafa.e@samsung.com;jungwon2.lee@samsung.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nchoi2017towards,\ntitle={Towards the Limit of Network Quantization},\nauthor={Yoojin Choi and Mostafa El-Khamy and Jungwon Lee},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=rJ8uNptgl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rJ8uNptgl", "pdf_size": 0, "rating": "7;7;7", "confidence": "3;4;3", "rating_avg": 7.0, "confidence_avg": 3.3333333333333335, "replies_avg": 14, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 259, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=702234881828234409&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "rJEgeXFex", "title": "Predicting Medications from Diagnostic Codes with Recurrent Neural Networks", "track": "main", "status": "Poster", "tldr": "Applying recurrent neural networks to fix errors and omissions in patient medication records.", "abstract": "It is a surprising fact that electronic medical records are failing at one of their primary purposes, that of tracking the set of medications that the patient is actively taking. Studies estimate that up to 50% of such lists omit active drugs, and that up to 25% of all active medications do not appear on the appropriate patient list. Manual efforts to maintain these lists involve a great deal of tedious human labor, which could be reduced by computational tools to suggest likely missing or incorrect medications on a patient\u2019s list. We report here an application of recurrent neural networks to predict the likely therapeutic classes of medications that a patient is taking, given a sequence of the last 100 billing codes in their record. Our best model was a GRU that achieved high prediction accuracy (micro-averaged AUC 0.93, Label Ranking Loss 0.076), limited by hardware constraints on model size. Additionally, examining individual cases revealed that many of the predictions marked incorrect were likely to be examples of either omitted medications or omitted billing codes, supporting our assertion of a substantial number of errors and omissions in the data, and the likelihood of models such as these to help correct them.", "keywords": "Deep learning;Supervised Learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Jacek M. Bajor;Thomas A. Lasko", "authorids": "jacek.m.bajor@vanderbilt.edu;tom.lasko@vanderbilt.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nbajor2017predicting,\ntitle={Predicting Medications from Diagnostic Codes with Recurrent Neural Networks},\nauthor={Jacek M. Bajor and Thomas A. 
Lasko},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=rJEgeXFex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rJEgeXFex", "pdf_size": 0, "rating": "6;7;8", "confidence": "3;5;4", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 15, "authors#_avg": 2, "corr_rating_confidence": 0.5, "gs_citation": 72, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=449499058147949682&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "rJJ3YU5ge", "title": "Is a picture worth a thousand words? A Deep Multi-Modal Fusion Architecture for Product Classification in e-commerce", "track": "main", "status": "Reject", "tldr": "", "abstract": "Classifying products into categories precisely and efficiently is a major challenge in modern e-commerce. The high traffic of new products uploaded daily and the dynamic nature of the categories raise the need for machine learning models that can reduce the cost and time of human editors. In this paper, we propose a decision level fusion approach for multi-modal product classification using text and image inputs. We train input specific state-of-the-art deep neural networks for each input source, show the potential of forging them together into a multi-modal architecture and train a novel policy network that learns to choose between them. Finally, we demonstrate that our multi-modal network improves the top-1 accuracy $\\%$ over both networks on a real-world large-scale product classification dataset that we collected from Walmart.com. While we focus on image-text fusion that characterizes e-commerce domains, our algorithms can be easily applied to other modalities such as audio, video, physical sensors, etc.", "keywords": "Multi-modal learning;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Tom Zahavy;Alessandro Magnani;Abhinandan Krishnan;Shie Mannor", "authorids": "tomzahavy@tx.technion.ac.il;AMagnani@walmartlabs.com;AKrishnan@walmartlabs.com;shie@ee.technion.ac.il", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nzahavy2017is,\ntitle={Is a picture worth a thousand words? A Deep Multi-Modal Fusion Architecture for Product Classification in e-commerce},\nauthor={Tom Zahavy and Alessandro Magnani and Abhinandan Krishnan and Shie Mannor},\nyear={2017},\nurl={https://openreview.net/forum?id=rJJ3YU5ge}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rJJ3YU5ge", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 99, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13795272429622617542&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "rJJRDvcex", "title": "Layer Recurrent Neural Networks", "track": "main", "status": "Reject", "tldr": "We propose a Layer-RNN (L-RNN) network that is able to learn contextual information adaptively using within-layer recurrence. 
We further propose to insert L-RNN to pre-trained CNNs seamlessly.", "abstract": "In this paper, we propose a Layer-RNN (L-RNN) module that is able to learn contextual information adaptively using within-layer recurrence. Our contributions are three-fold: \n(i) we propose a hybrid neural network architecture that interleaves traditional convolutional layers with L-RNN module for learning long- range dependencies at multiple levels; \n(ii) we show that a L-RNN module can be seamlessly inserted into any convolutional layer of a pre-trained CNN, and the entire network then fine-tuned, leading to a boost in performance; \n(iii) we report experiments on the CIFAR-10 classification task, showing that a network with interleaved convolutional layers and L-RNN modules, achieves comparable results (5.39% top1 error) using only 15 layers and fewer parameters to ResNet-164 (5.46%); and on the PASCAL VOC2012 semantic segmentation task, we show that the performance of a pre-trained FCN network can be boosted by 5% (mean IOU) by simply inserting Layer-RNNs.", "keywords": "Deep learning;Computer vision", "primary_area": "", "supplementary_material": "", "author": "Weidi Xie;Alison Noble;Andrew Zisserman", "authorids": "weidi.xie@eng.ox.ac.uk;alison.noble@eng.ox.ac.uk;az@robots.ox.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nxie2017layer,\ntitle={Layer Recurrent Neural Networks},\nauthor={Weidi Xie and Alison Noble and Andrew Zisserman},\nyear={2017},\nurl={https://openreview.net/forum?id=rJJRDvcex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rJJRDvcex", "pdf_size": 0, "rating": "5;6;7", "confidence": "5;4;4", "rating_avg": 6.0, "confidence_avg": 4.333333333333333, "replies_avg": 15, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844385, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "rJLS7qKel", "title": "Learning to Act by Predicting the Future", "track": "main", "status": "Oral", "tldr": "We present an approach to sensorimotor control in immersive environments.", "abstract": "We present an approach to sensorimotor control in immersive environments. Our approach utilizes a high-dimensional sensory stream and a lower-dimensional measurement stream. The cotemporal structure of these streams provides a rich supervisory signal, which enables training a sensorimotor control model by interacting with the environment. The model is trained using supervised learning techniques, but without extraneous supervision. It learns to act based on raw sensory input from a complex three-dimensional environment. The presented formulation enables learning without a fixed goal at training time, and pursuing dynamically changing goals at test time. We conduct extensive experiments in three-dimensional simulations based on the classical first-person game Doom. The results demonstrate that the presented approach outperforms sophisticated prior formulations, particularly on challenging tasks. The results also show that trained models successfully generalize across environments and goals. 
A model trained using the presented approach won the Full Deathmatch track of the Visual Doom AI Competition, which was held in previously unseen environments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Alexey Dosovitskiy;Vladlen Koltun", "authorids": "adosovitskiy@gmail.com;vkoltun@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\ndosovitskiy2017learning,\ntitle={Learning to Act by Predicting the Future},\nauthor={Alexey Dosovitskiy and Vladlen Koltun},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=rJLS7qKel}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rJLS7qKel", "pdf_size": 0, "rating": "7;8;8", "confidence": "4;4;4", "rating_avg": 7.666666666666667, "confidence_avg": 4.0, "replies_avg": 11, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 353, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1561134489783191776&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9 }, { "id": "rJM69B5xx", "title": "Finding a Jack-of-All-Trades: An Examination of Semi-supervised Learning in Reading Comprehension", "track": "main", "status": "Reject", "tldr": "We examine effect of transfer learning in AS Reader model from two source domains (CNN/DM and BookTest) to two target domains (bAbI and SQuAD).", "abstract": "Deep learning has proven useful on many NLP tasks including reading\ncomprehension. However it requires a lot of training data which are not\navailable in some domains of application. Hence we examine the possibility\nof using data-rich domains to pre-train models and then apply them in\ndomains where training data are harder to get. Specifically, we train a\nneural-network-based model on two context-question-answer datasets, the\nBookTest and CNN/Daily Mail, and we monitor transfer to subsets of bAbI,\na set of artificial tasks designed to test specific reasoning abilities, and of\nSQuAD, a question-answering dataset which is much closer to real-world\napplications. 
Our experiments show very limited transfer if the model isn\u2019t\nshown any training examples from the target domain however the results\nare promising if the model is shown at least a few target-domain examples.\nFurthermore we show that the effect of pre-training is not limited to word\nembeddings.", "keywords": "Natural language processing;Semi-Supervised Learning;Deep learning;Transfer Learning", "primary_area": "", "supplementary_material": "", "author": "Rudolf Kadlec;Ond\u0159ej Bajgar;Peter Hrincar;Jan Kleindienst", "authorids": "rudolf_kadlec@cz.ibm.com;obajgar@cz.ibm.com;phrincar@cz.ibm.com;jankle@cz.ibm.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nkadlec2017finding,\ntitle={Finding a Jack-of-All-Trades: An Examination of Semi-supervised Learning in Reading Comprehension},\nauthor={Rudolf Kadlec and Ond{\\v{r}}ej Bajgar and Peter Hrincar and Jan Kleindienst},\nyear={2017},\nurl={https://openreview.net/forum?id=rJM69B5xx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rJM69B5xx", "pdf_size": 0, "rating": "3;4;6", "confidence": "4;4;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.0, "replies_avg": 21, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12568929575793041716&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "rJPcZ3txx", "title": "Faster CNNs with Direct Sparse Convolutions and Guided Pruning", "track": "main", "status": "Poster", "tldr": "Highly-performance sparse convolution outperforms dense with only 70% sparsity. Performance model that guides training to find useful sparsity range, applied to AlexNet and GoogLeNet", "abstract": "Phenomenally successful in practical inference problems, convolutional neural networks (CNN) are widely deployed in mobile devices, data centers, and even supercomputers.\nThe number of parameters needed in CNNs, however, are often large and undesirable. Consequently, various methods have been developed to prune a CNN once it is trained. \nNevertheless, the resulting CNNs offer limited benefits. While pruning the fully connected layers reduces a CNN's size considerably, it does not improve inference speed noticeably as the compute heavy parts lie in convolutions. Pruning CNNs in a way that increase inference speed often imposes specific sparsity structures, thus limiting the achievable sparsity levels.\n\nWe present a method to realize simultaneously size economy and speed improvement while pruning CNNs. Paramount to our success is an efficient general sparse-with-dense matrix\nmultiplication implementation that is applicable to convolution of feature maps with kernels of arbitrary sparsity patterns. Complementing this, we developed a performance model that predicts sweet spots of sparsity levels for different layers and on different computer architectures. 
Together, these two allow us to demonstrate 3.1-7.3x convolution speedups over dense convolution in AlexNet, on Intel Atom, Xeon, and Xeon Phi processors, spanning the spectrum from mobile devices to supercomputers.\n", "keywords": "Deep learning;Optimization", "primary_area": "", "supplementary_material": "", "author": "Jongsoo Park;Sheng Li;Wei Wen;Ping Tak Peter Tang;Hai Li;Yiran Chen;Pradeep Dubey", "authorids": "jongsoo.park@intel.com;sheng.r.li@intel.com;peter.tang@intel.com;weiwen.web@gmail.com;HAL66@pitt.edu;yic52@pitt.edu;pradeep.dubey@intel.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\npark2017faster,\ntitle={Faster {CNN}s with Direct Sparse Convolutions and Guided Pruning},\nauthor={Jongsoo Park and Sheng Li and Wei Wen and Ping Tak Peter Tang and Hai Li and Yiran Chen and Pradeep Dubey},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=rJPcZ3txx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rJPcZ3txx", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;3;3", "rating_avg": 6.333333333333333, "confidence_avg": 3.0, "replies_avg": 7, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 307, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=934070388873043586&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "rJQKYt5ll", "title": "Steerable CNNs", "track": "main", "status": "Poster", "tldr": "", "abstract": "It has long been recognized that the invariance and equivariance properties of a representation are critically important for success in many vision tasks. In this paper we present Steerable Convolutional Neural Networks, an efficient and flexible class of equivariant convolutional networks. We show that steerable CNNs achieve state of the art results on the CIFAR image classification benchmark. The mathematical theory of steerable representations reveals a type system in which any steerable representation is a composition of elementary feature types, each one associated with a particular kind of symmetry. We show how the parameter cost of a steerable filter bank depends on the types of the input and output features, and show how to use this knowledge to construct CNNs that utilize parameters effectively.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Taco S. Cohen;Max Welling", "authorids": "taco.cohen@gmail.com;m.welling@uva.nl", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\ncohen2017steerable,\ntitle={Steerable {CNN}s},\nauthor={Taco S. 
Cohen and Max Welling},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=rJQKYt5ll}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rJQKYt5ll", "pdf_size": 0, "rating": "6;7;8", "confidence": "3;4;3", "rating_avg": 7.0, "confidence_avg": 3.3333333333333335, "replies_avg": 16, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 618, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11877249517551081982&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "rJRhzzKxl", "title": "Knowledge Adaptation: Teaching to Adapt", "track": "main", "status": "Reject", "tldr": "We propose a teacher-student framework for domain adaptation together with a novel confidence measure that achieves state-of-the-art results on single-source and multi-source adaptation on a standard sentiment analysis benchmark.", "abstract": "Domain adaptation is crucial in many real-world applications where the distribution of the training data differs from the distribution of the test data. Previous Deep Learning-based approaches to domain adaptation need to be trained jointly on source and target domain data and are therefore unappealing in scenarios where models need to be adapted to a large number of domains or where a domain is evolving, e.g. spam detection where attackers continuously change their tactics.\n\nTo fill this gap, we propose Knowledge Adaptation, an extension of Knowledge Distillation (Bucilua et al., 2006; Hinton et al., 2015) to the domain adaptation scenario. We show how a student model achieves state-of-the-art results on unsupervised domain adaptation from multiple sources on a standard sentiment analysis benchmark by taking into account the domain-specific expertise of multiple teachers and the similarities between their domains.\n\nWhen learning from a single teacher, using domain similarity to gauge trustworthiness is inadequate. To this end, we propose a simple metric that correlates well with the teacher's accuracy in the target domain. We demonstrate that incorporating high-confidence examples selected by this metric enables the student model to achieve state-of-the-art performance in the single-source scenario.", "keywords": "Natural language processing;Deep learning;Transfer Learning;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Sebastian Ruder;Parsa Ghaffari;John G. Breslin", "authorids": "sebastian.ruder@insight-centre.org;parsa@aylien.com;john.breslin@insight-centre.org", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nruder2017knowledge,\ntitle={Knowledge Adaptation: Teaching to Adapt},\nauthor={Sebastian Ruder and Parsa Ghaffari and John G. 
Breslin},\nyear={2017},\nurl={https://openreview.net/forum?id=rJRhzzKxl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rJRhzzKxl", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;4;3", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 16, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 58, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3646415111768249558&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "rJTKKKqeg", "title": "Tracking the World State with Recurrent Entity Networks", "track": "main", "status": "Poster", "tldr": "A new memory-augmented model which learns to track the world state, obtaining SOTA on the bAbI tasks amongst other results.", "abstract": "We introduce a new model, the Recurrent Entity Network (EntNet). It is equipped\nwith a dynamic long-term memory which allows it to maintain and update a rep-\nresentation of the state of the world as it receives new data. For language under-\nstanding tasks, it can reason on-the-fly as it reads text, not just when it is required\nto answer a question or respond as is the case for a Memory Network (Sukhbaatar\net al., 2015). Like a Neural Turing Machine or Differentiable Neural Computer\n(Graves et al., 2014; 2016) it maintains a fixed size memory and can learn to\nperform location and content-based read and write operations. However, unlike\nthose models it has a simple parallel architecture in which several memory loca-\ntions can be updated simultaneously. The EntNet sets a new state-of-the-art on\nthe bAbI tasks, and is the first method to solve all the tasks in the 10k training\nexamples setting. We also demonstrate that it can solve a reasoning task which\nrequires a large number of supporting facts, which other methods are not able to\nsolve, and can generalize past its training horizon. 
It can also be practically used\non large scale datasets such as Children\u2019s Book Test, where it obtains competitive\nperformance, reading the story in a single pass.", "keywords": "Natural language processing;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Mikael Henaff;Jason Weston;Arthur Szlam;Antoine Bordes;Yann LeCun", "authorids": "mbh305@nyu.edu;jase@fb.com;azslam@fb.com;abordes@fb.com;yann@fb.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nhenaff2017tracking,\ntitle={Tracking the World State with Recurrent Entity Networks},\nauthor={Mikael Henaff and Jason Weston and Arthur Szlam and Antoine Bordes and Yann LeCun},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=rJTKKKqeg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rJTKKKqeg", "pdf_size": 0, "rating": "7;7;7", "confidence": "3;4;5", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 18, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 298, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15954480746316535959&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "rJXTf9Bxg", "title": "Conditional Image Synthesis With Auxiliary Classifier GANs", "track": "main", "status": "Reject", "tldr": "We introduce a special GAN architecture that results in high quality 128x128 ImageNet samples; we introduce 2 new quantitative metrics of sample quality.", "abstract": "Synthesizing high resolution photorealistic images has been a long-standing challenge in machine learning. In this paper we introduce new methods for the improved training of generative adversarial networks (GANs) for image synthesis. We construct a variant of GANs employing label conditioning that results in 128x128 resolution image samples exhibiting global coherence. We expand on previous work for image quality assessment to provide two new analyses for assessing the discriminability and diversity of samples from class-conditional image synthesis models. These analyses demonstrate that high resolution samples provide class information not present in low resolution samples. Across 1000 ImageNet classes, 128x128 samples are more than twice as discriminable as artificially resized 32x32 samples. 
In addition, 84.7% of the classes have samples exhibiting diversity comparable to real ImageNet data.", "keywords": "Deep learning", "primary_area": "", "supplementary_material": "", "author": "Augustus Odena;Christopher Olah;Jonathon Shlens", "authorids": "augustusodena@google.com;colah@google.com;shlens@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nodena2017conditional,\ntitle={Conditional Image Synthesis With Auxiliary Classifier {GAN}s},\nauthor={Augustus Odena and Christopher Olah and Jonathon Shlens},\nyear={2017},\nurl={https://openreview.net/forum?id=rJXTf9Bxg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rJXTf9Bxg", "pdf_size": 0, "rating": "3;6;6", "confidence": "4;5;4", "rating_avg": 5.0, "confidence_avg": 4.333333333333333, "replies_avg": 14, "authors#_avg": 3, "corr_rating_confidence": 0.5, "gs_citation": 4527, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14828291299960415366&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "id": "rJY0-Kcll", "title": "Optimization as a Model for Few-Shot Learning", "track": "main", "status": "Oral", "tldr": "We propose an LSTM-based meta-learner model to learn the exact optimization algorithm used to train another learner neural network in the few-shot regime", "abstract": "Though deep neural networks have shown great success in the large data domain, they generally perform poorly on few-shot learning tasks, where a model has to quickly generalize after seeing very few examples from each class. The general belief is that gradient-based optimization in high capacity models requires many iterative steps over many examples to perform well. Here, we propose an LSTM-based meta-learner model to learn the exact optimization algorithm used to train another learner neural network in the few-shot regime. The parametrization of our model allows it to learn appropriate parameter updates specifically for the scenario where a set amount of updates will be made, while also learning a general initialization of the learner network that allows for quick convergence of training. We demonstrate that this meta-learning model is competitive with deep metric-learning techniques for few-shot learning. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sachin Ravi;Hugo Larochelle", "authorids": "sachinr@twitter.com;", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nravi2017optimization,\ntitle={Optimization as a Model for Few-Shot Learning},\nauthor={Sachin Ravi and Hugo Larochelle},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=rJY0-Kcll}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rJY0-Kcll", "pdf_size": 0, "rating": "6;8;9", "confidence": "4;4;5", "rating_avg": 7.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 17, "authors#_avg": 2, "corr_rating_confidence": 0.7559289460184544, "gs_citation": 4112, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9936470188259741299&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "rJY3vK9eg", "title": "Neural Combinatorial Optimization with Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "This paper presents a framework to tackle combinatorial optimization problems using neural networks and reinforcement learning.", "abstract": "This paper presents a framework to tackle combinatorial optimization problems using neural networks and reinforcement learning. We focus on the traveling salesman problem (TSP) and train a recurrent neural network that, given a set of city coordinates, predicts a distribution over different city permutations. Using negative tour length as the reward signal, we optimize the parameters of the recurrent neural network using a policy gradient method. We compare learning the network parameters on a set of training graphs against learning them on individual test graphs. Without much engineering and heuristic designing, Neural Combinatorial Optimization achieves close to optimal results on 2D Euclidean graphs with up to 100 nodes. Applied to the KnapSack, another NP-hard problem, the same method obtains optimal solutions for instances with up to 200 items. These results, albeit still far from state-of-the-art, give insights into how neural networks can be used as a general tool for tackling combinatorial optimization problems.", "keywords": "Reinforcement Learning;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Irwan Bello*;Hieu Pham*;Quoc V. Le;Mohammad Norouzi;Samy Bengio", "authorids": "ibello@google.com;hyhieu@google.com;qvl@google.com;mnorouzi@google.com;bengio@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nbello*2017neural,\ntitle={Neural Combinatorial Optimization with Reinforcement Learning},\nauthor={Irwan Bello* and Hieu Pham* and Quoc V. 
Le and Mohammad Norouzi and Samy Bengio},\nyear={2017},\nurl={https://openreview.net/forum?id=rJY3vK9eg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rJY3vK9eg", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;4;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 34, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 2184, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13936411244086798707&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "rJbPBt9lg", "title": "Neural Code Completion", "track": "main", "status": "Reject", "tldr": "", "abstract": "Code completion, an essential part of modern software development, can be challenging for dynamically typed programming languages. In this paper we explore the use of neural network techniques to automatically learn code completion from a large corpus of dynamically typed JavaScript code. We show different neural networks that leverage not only token level information but also structural information, and evaluate their performance on different prediction tasks. We demonstrate that our models can outperform the state-of-the-art approach, which is based on decision tree techniques, on both next non-terminal and next terminal prediction tasks by 3.8 points and 0.5 points respectively. We believe that neural network techniques can play a transformative role in helping software developers manage the growing complexity of software systems, and we see this work as a first step in that direction.", "keywords": "Deep learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Chang Liu;Xin Wang;Richard Shin;Joseph E. Gonzalez;Dawn Song", "authorids": "xinw@eecs.berkeley.edu;liuchang@eecs.berkeley.edu;ricshin@berkeley.edu;jegonzal@berkeley.edu;dawnsong@cs.berkeley.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nliu2017neural,\ntitle={Neural Code Completion},\nauthor={Chang Liu and Xin Wang and Richard Shin and Joseph E. Gonzalez and Dawn Song},\nyear={2017},\nurl={https://openreview.net/forum?id=rJbPBt9lg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rJbPBt9lg", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 18, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7829727492476022426&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "rJbbOLcex", "title": "TopicRNN: A Recurrent Neural Network with Long-Range Semantic Dependency", "track": "main", "status": "Poster", "tldr": "", "abstract": "In this paper, we propose TopicRNN, a recurrent neural network (RNN)-based language model designed to directly capture the global semantic meaning relating words in a document via latent topics. Because of their sequential nature, RNNs are good at capturing the local structure of a word sequence \u2013 both semantic and syntactic \u2013 but might face difficulty remembering long-range dependencies. Intuitively, these long-range dependencies are of semantic nature. 
In contrast, latent topic models are able to capture the global underlying semantic structure of a document but do not account for word ordering. The proposed TopicRNN model integrates the merits of RNNs and latent topic models: it captures local (syntactic) dependencies using an RNN and global (semantic) dependencies using latent topics. Unlike previous work on contextual RNN language modeling, our model is learned end-to-end. Empirical results on word prediction show that TopicRNN outperforms existing contextual RNN baselines. In addition, TopicRNN can be used as an unsupervised feature extractor for documents. We do this for sentiment analysis on the IMDB movie review dataset and report an error rate of 6.28%. This is comparable to the state-of-the-art 5.91% resulting from a semi-supervised approach. Finally, TopicRNN also yields sensible topics, making it a useful alternative to document models such as latent Dirichlet allocation.", "keywords": "Natural language processing;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Adji B. Dieng;Chong Wang;Jianfeng Gao;John Paisley", "authorids": "abd2141@columbia.edu;chowang@microsoft.com;jfgao@microsoft.com;jpaisley@columbia.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\ndieng2017topicrnn,\ntitle={Topic{RNN}: A Recurrent Neural Network with Long-Range Semantic Dependency},\nauthor={Adji B. Dieng and Chong Wang and Jianfeng Gao and John Paisley},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=rJbbOLcex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rJbbOLcex", "pdf_size": 0, "rating": "6;7;8", "confidence": "3;4;4", "rating_avg": 7.0, "confidence_avg": 3.6666666666666665, "replies_avg": 18, "authors#_avg": 4, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 310, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10650506792701244256&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "rJe-Pr9le", "title": "Multi-task learning with deep model based reinforcement learning", "track": "main", "status": "Reject", "tldr": "We build a world model, based on CNN's and RNN's, to play multiple ATARI games simultaneously, achieving super-human performance.", "abstract": "In recent years, model-free methods that use deep learning have achieved great success in many different reinforcement learning environments. Most successful approaches focus on solving a single task, while multi-task reinforcement learning remains an open problem. In this paper, we present a model based approach to deep reinforcement learning which we use to solve different tasks simultaneously. We show that our approach not only does not degrade but actually benefits from learning multiple tasks. For our model, we also present a new kind of recurrent neural network inspired by residual networks that decouples memory from computation allowing to model complex environments that do not require lots of memory. 
The code will be released before ICLR 2017.", "keywords": "Reinforcement Learning;Deep learning;Games;Transfer Learning", "primary_area": "", "supplementary_material": "", "author": "Asier Mujika", "authorids": "asierm@student.ethz.ch", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nmujika2017multitask,\ntitle={Multi-task learning with deep model based reinforcement learning},\nauthor={Asier Mujika},\nyear={2017},\nurl={https://openreview.net/forum?id=rJe-Pr9le}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rJe-Pr9le", "pdf_size": 0, "rating": "2;4;4", "confidence": "5;4;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.333333333333333, "replies_avg": 12, "authors#_avg": 1, "corr_rating_confidence": -1.0, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6424348612311443655&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "rJeKjwvclx", "title": "Dynamic Coattention Networks For Question Answering", "track": "main", "status": "Poster", "tldr": "An end-to-end dynamic neural network model for question answering that achieves the state of the art and best leaderboard performance on the Stanford QA dataset.", "abstract": "Several deep learning models have been proposed for question answering. However, due to their single-pass nature, they have no way to recover from local maxima corresponding to incorrect answers. To address this problem, we introduce the Dynamic Coattention Network (DCN) for question answering. The DCN first fuses co-dependent representations of the question and the document in order to focus on relevant parts of both. Then a dynamic pointer decoder iterates over potential answer spans. This iterative procedure enables the model to recover from initial local maxima corresponding to incorrect answers. 
On the Stanford question answering dataset, a single DCN model improves the previous state of the art from 71.0% F1 to 75.9%, while a DCN ensemble obtains 80.4% F1.", "keywords": "Natural language processing;Deep learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Caiming Xiong;Victor Zhong;Richard Socher", "authorids": "cxiong@salesforce.com;vzhong@salesforce.com;rsocher@salesforce.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nxiong2017dynamic,\ntitle={Dynamic Coattention Networks For Question Answering},\nauthor={Caiming Xiong and Victor Zhong and Richard Socher},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=rJeKjwvclx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rJeKjwvclx", "pdf_size": 0, "rating": "8;8;8", "confidence": "4;3;4", "rating_avg": 8.0, "confidence_avg": 3.6666666666666665, "replies_avg": 32, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 747, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7601883970857280608&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "rJfMusFll", "title": "Batch Policy Gradient Methods for Improving Neural Conversation Models", "track": "main", "status": "Poster", "tldr": "", "abstract": "We study reinforcement learning of chat-bots with recurrent neural network\narchitectures when the rewards are noisy and expensive to\nobtain. For instance, a chat-bot used in automated customer service support can\nbe scored by quality assurance agents, but this process can be expensive, time consuming\nand noisy. 
\nPrevious reinforcement learning work for natural language uses on-policy updates\nand/or is designed for on-line learning settings.\nWe demonstrate empirically that such strategies are not appropriate for this setting\nand develop an off-policy batch policy gradient method (\\bpg).\nWe demonstrate the efficacy of our method via a series of\nsynthetic experiments and an Amazon Mechanical Turk experiment on\na restaurant recommendations dataset.\n\n", "keywords": "Natural language processing;Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Kirthevasan Kandasamy;Yoram Bachrach;Ryota Tomioka;Daniel Tarlow;David Carter", "authorids": "kandasamy@cmu.edu;yorambac@gmail.com;ryoto@microsoft.com;dtarlow@microsoft.com;dacart@microsoft.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nkandasamy2017batch,\ntitle={Batch Policy Gradient Methods for Improving Neural Conversation Models},\nauthor={Kirthevasan Kandasamy and Yoram Bachrach and Ryota Tomioka and Daniel Tarlow and David Carter},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=rJfMusFll}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rJfMusFll", "pdf_size": 0, "rating": "6;7;8", "confidence": "3;3;3", "rating_avg": 7.0, "confidence_avg": 3.0, "replies_avg": 14, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=440880178084845028&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4 }, { "id": "rJg_1L5gg", "title": "Incremental Sequence Learning", "track": "main", "status": "Reject", "tldr": "We investigate a technique for sequence learning where the initial parts of the sequences are learned first; this is found to not only greatly speed up learning, but moreover to strongly improve generalization performance.", "abstract": "Deep learning research over the past years has shown that by increasing the scope or difficulty of the learning problem over time, increasingly complex learning problems can be addressed. We study incremental learning in the context of sequence learning, using generative RNNs in the form of multi-layer recurrent Mixture Density Networks. While the potential of incremental or curriculum learning to enhance learning is known, indiscriminate application of the principle does not necessarily lead to improvement, and it is essential therefore to know which forms of incremental or curriculum learning have a positive effect. This research contributes to that aim by comparing three instantiations of incremental or curriculum learning.\n\nWe introduce Incremental Sequence Learning, a simple incremental approach to sequence learning. Incremental Sequence Learning starts out by using only the first few steps of each sequence as training data. Each time a performance criterion has been reached, the length of the parts of the sequences used for training is increased.\n\nWe introduce and make available a novel sequence learning task and data set: predicting and classifying MNIST pen stroke sequences. 
We find that Incremental Sequence Learning greatly speeds up sequence learning and reaches the best test performance level of regular sequence learning 20 times faster, reduces the test error by 74%, and in general performs more robustly; it displays lower variance and achieves sustained progress after all three comparison methods have stopped improving. The other instantiations of curriculum learning do not result in any noticeable improvement. A trained sequence prediction model is also used in transfer learning to the task of sequence classification, where it is found that transfer learning realizes improved classification performance compared to methods that learn to classify from scratch.\n", "keywords": "Deep learning;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Edwin D. de Jong", "authorids": "edwin.webmail@gmail.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\njong2017incremental,\ntitle={Incremental Sequence Learning},\nauthor={Edwin D. de Jong},\nyear={2017},\nurl={https://openreview.net/forum?id=rJg_1L5gg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=rJg_1L5gg", "pdf_size": 0, "rating": "3;5;5", "confidence": "4;4;3", "rating_avg": 4.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 17, "authors#_avg": 1, "corr_rating_confidence": -0.49999999999999983, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6814695179462078545&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "rJiNwv9gg", "title": "Lossy Image Compression with Compressive Autoencoders", "track": "main", "status": "Poster", "tldr": "A simple approach to train autoencoders to compress images as well or better than JPEG 2000.", "abstract": "We propose a new approach to the problem of optimizing autoencoders for lossy image compression. New media formats, changing hardware technology, as well as diverse requirements and content types create a need for compression algorithms which are more flexible than existing codecs. Autoencoders have the potential to address this need, but are difficult to optimize directly due to the inherent non-differentiabilty of the compression loss. We here show that minimal changes to the loss are sufficient to train deep autoencoders competitive with JPEG 2000 and outperforming recently proposed approaches based on RNNs. Our network is furthermore computationally efficient thanks to a sub-pixel architecture, which makes it suitable for high-resolution images. 
This is in contrast to previous work on autoencoders for compression using coarser approximations, shallower architectures, computationally expensive methods, or focusing on small images.", "keywords": "Computer vision;Deep learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Lucas Theis;Wenzhe Shi;Andrew Cunningham;Ferenc Husz\u00e1r", "authorids": "ltheis@twitter.com;wshi@twitter.com;acunningham@twitter.com;fhuszar@twitter.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\ntheis2017lossy,\ntitle={Lossy Image Compression with Compressive Autoencoders},\nauthor={Lucas Theis and Wenzhe Shi and Andrew Cunningham and Ferenc Husz{\\'a}r},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=rJiNwv9gg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rJiNwv9gg", "pdf_size": 0, "rating": "5;7;8", "confidence": "4;3;5", "rating_avg": 6.666666666666667, "confidence_avg": 4.0, "replies_avg": 31, "authors#_avg": 4, "corr_rating_confidence": 0.3273268353539886, "gs_citation": 1381, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13226490013777095959&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "rJo9n9Feg", "title": "Chess Game Concepts Emerge under Weak Supervision: A Case Study of Tic-tac-toe", "track": "main", "status": "Reject", "tldr": "investigating whether a CNN understands concepts from a new perspective", "abstract": "This paper explores the possibility of learning chess game concepts under weak supervision with convolutional neural networks, which is a topic that has not been visited to the best of our knowledge. We put this task in three different backgrounds: (1) deep reinforcement learning has shown an amazing capability to learn a mapping from visual inputs to most rewarding actions, without knowing the concepts of a video game. But how could we confirm that the network understands these concepts or it just does not? (2) cross-modal supervision for visual representation learning draws much attention recently. Is this methodology still applicable when it comes to the domain of game concepts and actions? (3) class activation mapping is widely recognized as a visualization technique to help us understand what a network has learnt. Is it possible for it to activate at non-salient regions? With the simplest chess game tic-tac-toe, we report interesting results as answers to those three questions mentioned above. 
All codes, pre-processed datasets and pre-trained models will be released.", "keywords": "Semi-Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Hao Zhao;Ming Lu;Anbang Yao;Yurong Chen;Li Zhang", "authorids": "zhao-h13@mails.tsinghua.edu.cn;lu-m13@mails.tsinghua.edu.cn;anbang.yao@intel.com;yurong.chen@intel.com;chinazhangli@mail.tsinghua.edu.cn", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nzhao2017chess,\ntitle={Chess Game Concepts Emerge under Weak Supervision: A Case Study of Tic-tac-toe},\nauthor={Hao Zhao and Ming Lu and Anbang Yao and Yurong Chen and Li Zhang},\nyear={2017},\nurl={https://openreview.net/forum?id=rJo9n9Feg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rJo9n9Feg", "pdf_size": 0, "rating": "3;3;3", "confidence": "2;3;5", "rating_avg": 3.0, "confidence_avg": 3.3333333333333335, "replies_avg": 13, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:nu5-4unpXrQJ:scholar.google.com/&scioq=Chess+Game+Concepts+Emerge+under+Weak+Supervision:+A+Case+Study+of+Tic-tac-toe&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "rJqBEPcxe", "title": "Zoneout: Regularizing RNNs by Randomly Preserving Hidden Activations", "track": "main", "status": "Poster", "tldr": "Zoneout is like dropout (for RNNs) but uses identity masks instead of zero masks", "abstract": "We propose zoneout, a novel method for regularizing RNNs.\nAt each timestep, zoneout stochastically forces some hidden units to maintain their previous values.\nLike dropout, zoneout uses random noise to train a pseudo-ensemble, improving generalization.\nBut by preserving instead of dropping hidden units, gradient information and state information are more readily propagated through time, as in feedforward stochastic depth networks.\nWe perform an empirical investigation of various RNN regularizers, and find that zoneout gives significant performance improvements across tasks. 
We achieve competitive results with relatively simple models in character- and word-level language modelling on the Penn Treebank and Text8 datasets, and combining with recurrent batch normalization yields state-of-the-art results on permuted sequential MNIST.", "keywords": "Deep learning", "primary_area": "", "supplementary_material": "", "author": "David Krueger;Tegan Maharaj;Janos Kramar;Mohammad Pezeshki;Nicolas Ballas;Nan Rosemary Ke;Anirudh Goyal;Yoshua Bengio;Aaron Courville;Christopher Pal", "authorids": "davidscottkrueger@gmail.com;;;;;;;;;", "gender": ";;;;;;;;;", "homepage": ";;;;;;;;;", "dblp": ";;;;;;;;;", "google_scholar": ";;;;;;;;;", "orcid": ";;;;;;;;;", "linkedin": ";;;;;;;;;", "or_profile": ";;;;;;;;;", "aff": ";;;;;;;;;", "aff_domain": ";;;;;;;;;", "position": ";;;;;;;;;", "bibtex": "@inproceedings{\nkrueger2017zoneout,\ntitle={Zoneout: Regularizing {RNN}s by Randomly Preserving Hidden Activations},\nauthor={David Krueger and Tegan Maharaj and Janos Kramar and Mohammad Pezeshki and Nicolas Ballas and Nan Rosemary Ke and Anirudh Goyal and Yoshua Bengio and Aaron Courville and Christopher Pal},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=rJqBEPcxe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rJqBEPcxe", "pdf_size": 0, "rating": "7;8;8", "confidence": "4;4;5", "rating_avg": 7.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 15, "authors#_avg": 10, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 397, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13800900548977291683&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "rJqFGTslg", "title": "Pruning Filters for Efficient ConvNets", "track": "main", "status": "Poster", "tldr": "", "abstract": "The success of CNNs in various applications is accompanied by a significant increase in the computation and parameter storage costs. Recent efforts toward reducing these overheads involve pruning and compressing the weights of various layers without hurting original accuracy. However, magnitude-based pruning of weights reduces a significant number of parameters from the fully connected layers and may not adequately reduce the computation costs in the convolutional layers due to irregular sparsity in the pruned networks. We present an acceleration method for CNNs, where we prune filters from CNNs that are identified as having a small effect on the output accuracy. By removing whole filters in the network together with their connecting feature maps, the computation costs are reduced significantly. In contrast to pruning weights, this approach does not result in sparse connectivity patterns. Hence, it does not need the support of sparse convolution libraries and can work with existing efficient BLAS libraries for dense matrix multiplications. We show that even simple filter pruning techniques can reduce inference costs for VGG-16 by up to 34% and ResNet-110 by up to 38% on CIFAR10 while regaining close to the original accuracy by retraining the networks. 
", "keywords": "Computer vision;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Hao Li;Asim Kadav;Igor Durdanovic;Hanan Samet;Hans Peter Graf", "authorids": "haoli@cs.umd.edu;asim@nec-labs.com;igord@nec-labs.com;hjs@cs.umd.edu;hpg@nec-labs.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nli2017pruning,\ntitle={Pruning Filters for Efficient ConvNets},\nauthor={Hao Li and Asim Kadav and Igor Durdanovic and Hanan Samet and Hans Peter Graf},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=rJqFGTslg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=rJqFGTslg", "pdf_size": 0, "rating": "6;7;7;7", "confidence": "5;5;4;4", "rating_avg": 6.75, "confidence_avg": 4.5, "replies_avg": 12, "authors#_avg": 5, "corr_rating_confidence": -0.5773502691896257, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3353206785042646074&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "rJq_YBqxx", "title": "Deep Character-Level Neural Machine Translation By Learning Morphology", "track": "main", "status": "Reject", "tldr": "We devise a character-level neural machine translation built on six recurrent networks, and obtain a BLEU score comparable to the state-of-the-art NMT on En-Fr and Cs-En translation tasks. ", "abstract": "Neural machine translation aims at building a single large neural network that can be trained to maximize translation performance. The encoder-decoder architecture with an attention mechanism achieves a translation performance comparable to the existing state-of-the-art phrase-based systems. However, the use of large vocabulary becomes the bottleneck in both training and improving the performance. In this paper, we propose a novel architecture which learns morphology by using two recurrent networks and a hierarchical decoder which translates at character level. This gives rise to a deep character-level model consisting of six recurrent networks. Such a deep model has two major advantages. It avoids the large vocabulary issue radically; at the same time, it is more efficient in training than word-based models. Our model obtains a higher BLEU score than the bpe-based model after training for one epoch on En-Fr and En-Cs translation tasks. 
Further analyses show that our model is able to learn morphology.\n\n", "keywords": "Natural language processing;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Shenjian Zhao;Zhihua Zhang", "authorids": "sword.york@gmail.com;zhzhang@math.pku.edu.cn", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nzhao2017deep,\ntitle={Deep Character-Level Neural Machine Translation By Learning Morphology},\nauthor={Shenjian Zhao and Zhihua Zhang},\nyear={2017},\nurl={https://openreview.net/forum?id=rJq_YBqxx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rJq_YBqxx", "pdf_size": 0, "rating": "5;6;7", "confidence": "5;4;4", "rating_avg": 6.0, "confidence_avg": 4.333333333333333, "replies_avg": 19, "authors#_avg": 2, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15392606863335349367&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "rJsiFTYex", "title": "A Way out of the Odyssey: Analyzing and Combining Recent Insights for LSTMs", "track": "main", "status": "Reject", "tldr": "Relatively simple augmentations to the LSTM, such as Monte Carlo test time averaging, deep vector averaging, and residual connections, can yield massive accuracy improvements on text classification datasets.", "abstract": "LSTMs have become a basic building block for many deep NLP models. In recent years, many improvements and variations have been proposed for deep sequence models in general, and LSTMs in particular. We propose and analyze a series of architectural modifications for LSTM networks resulting in improved performance for text classification datasets. We observe compounding improvements on traditional LSTMs using Monte Carlo test-time model averaging, deep vector averaging (DVA), and residual connections, along with four other suggested modifications. 
Our analysis provides a simple, reliable, and high quality baseline model.", "keywords": "Natural language processing;Deep learning;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Shayne Longpre;Sabeek Pradhan;Caiming Xiong;Richard Socher", "authorids": "slongpre@cs.stanford.edu;sabeekp@cs.stanford.edu;cxiong@salesforce.com;rsocher@salesforce.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nlongpre2017a,\ntitle={A Way out of the Odyssey: Analyzing and Combining Recent Insights for {LSTM}s},\nauthor={Shayne Longpre and Sabeek Pradhan and Caiming Xiong and Richard Socher},\nyear={2017},\nurl={https://openreview.net/forum?id=rJsiFTYex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rJsiFTYex", "pdf_size": 0, "rating": "5;5;5", "confidence": "4;4;4", "rating_avg": 5.0, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16355454685385050151&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "rJxDkvqee", "title": "Multi-view Recurrent Neural Acoustic Word Embeddings", "track": "main", "status": "Poster", "tldr": "", "abstract": "Recent work has begun exploring neural acoustic word embeddings\u2013fixed dimensional vector representations of arbitrary-length speech segments corresponding to words. Such embeddings are applicable to speech retrieval and recognition tasks, where reasoning about whole words may make it possible to avoid ambiguous sub-word representations. The main idea is to map acoustic sequences to fixed-dimensional vectors such that examples of the same word are mapped to similar vectors, while different-word examples are mapped to very different vectors. In this work we take a multi-view approach to learning acoustic word embeddings, in which we jointly learn to embed acoustic sequences and their corresponding character sequences. We use deep bidirectional LSTM embedding models and multi-view contrastive losses. We study the effect of different loss variants, including fixed-margin and cost-sensitive losses. Our acoustic word embeddings improve over previous approaches for the task of word discrimination. 
We also present results on other tasks that are enabled by the multi-view approach, including cross-view word discrimination and word similarity.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wanjia He;Weiran Wang;Karen Livescu", "authorids": "wanjia@ttic.edu;weiranwang@ttic.edu;klivescu@ttic.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nhe2017multiview,\ntitle={Multi-view Recurrent Neural Acoustic Word Embeddings},\nauthor={Wanjia He and Weiran Wang and Karen Livescu},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=rJxDkvqee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rJxDkvqee", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;3;4", "rating_avg": 5.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 104, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18084114585220155330&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "rJxdQ3jeg", "title": "End-to-end Optimized Image Compression", "track": "main", "status": "Oral", "tldr": "", "abstract": "We describe an image compression method, consisting of a nonlinear analysis transformation, a uniform quantizer, and a nonlinear synthesis transformation. The transforms are constructed in three successive stages of convolutional linear filters and nonlinear activation functions. Unlike most convolutional neural networks, the joint nonlinearity is chosen to implement a form of local gain control, inspired by those used to model biological neurons. Using a variant of stochastic gradient descent, we jointly optimize the entire model for rate-distortion performance over a database of training images, introducing a continuous proxy for the discontinuous loss function arising from the quantizer. Under certain conditions, the relaxed loss function may be interpreted as the log likelihood of a generative model, as implemented by a variational autoencoder. Unlike these models, however, the compression model must operate at any given point along the rate-distortion curve, as specified by a trade-off parameter. Across an independent set of test images, we find that the optimized method generally exhibits better rate-distortion performance than the standard JPEG and JPEG 2000 compression methods. More importantly, we observe a dramatic improvement in visual quality for all images at all bit rates, which is supported by objective quality estimates using MS-SSIM.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Johannes Ball\u00e9;Valero Laparra;Eero P. Simoncelli", "authorids": "johannes.balle@nyu.edu;valero.laparra@uv.es;eero.simoncelli@nyu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nball{\\'e}2017endtoend,\ntitle={End-to-end Optimized Image Compression},\nauthor={Johannes Ball{\\'e} and Valero Laparra and Eero P. 
Simoncelli},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=rJxdQ3jeg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=rJxdQ3jeg", "pdf_size": 0, "rating": "8;8;8;9", "confidence": "3;4;4;4", "rating_avg": 8.25, "confidence_avg": 3.75, "replies_avg": 16, "authors#_avg": 3, "corr_rating_confidence": 0.3333333333333333, "gs_citation": 2179, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1152338433659809765&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 13 }, { "id": "rJzaDdYxx", "title": "Gradients of Counterfactuals", "track": "main", "status": "Reject", "tldr": "A method for identifying feature importance in deep networks using gradients of counterfactual inputs", "abstract": "Gradients have been used to quantify feature importance in machine learning models. Unfortunately, in nonlinear deep networks, not only individual neurons but also the whole network can saturate, and as a result an important input feature can have a tiny gradient. We study various networks, and observe that this phenomena is indeed widespread, across many inputs.\n\nWe propose to examine interior gradients, which are gradients of counterfactual inputs constructed by scaling down the original input. We apply our method to the GoogleNet architecture for object recognition in images, as well as a ligand-based virtual screening network with categorical features and an LSTM based language model for the Penn Treebank dataset. We visualize how interior gradients better capture feature importance. Furthermore, interior gradients are applicable to a wide variety of deep networks, and have the attribution property that the feature importance scores sum to the the prediction score. \n\nBest of all, interior gradients can be computed just as easily as gradients. In contrast, previous methods are complex to implement, which hinders practical adoption.", "keywords": "Deep learning;Computer vision;Theory", "primary_area": "", "supplementary_material": "", "author": "Mukund Sundararajan;Ankur Taly;Qiqi Yan", "authorids": "mukunds@google.com;ataly@google.com;qiqiyan@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nsundararajan2017gradients,\ntitle={Gradients of Counterfactuals},\nauthor={Mukund Sundararajan and Ankur Taly and Qiqi Yan},\nyear={2017},\nurl={https://openreview.net/forum?id=rJzaDdYxx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rJzaDdYxx", "pdf_size": 0, "rating": "3;3;5", "confidence": "4;4;4", "rating_avg": 3.6666666666666665, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 150, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13443875828607020653&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "rk5upnsxe", "title": "Normalizing the Normalizers: Comparing and Extending Network Normalization Schemes", "track": "main", "status": "Poster", "tldr": "", "abstract": "Normalization techniques have only recently begun to be exploited in supervised learning tasks. Batch normalization exploits mini-batch statistics to normalize the activations. This was shown to speed up training and result in better models. 
However its success has been very limited when dealing with recurrent neural networks. On the other hand, layer normalization normalizes the activations across all activities within a layer. This was shown to work well in the recurrent setting. In this paper we propose a unified view of normalization techniques, as forms of divisive normalization, which includes layer and batch normalization as special cases. Our second contribution is the finding that a small modification to these normalization schemes, in conjunction with a sparse regularizer on the activations, leads to significant benefits over standard normalization techniques. We demonstrate the effectiveness of our unified divisive normalization framework in the context of convolutional neural nets and recurrent neural networks, showing improvements over baselines in image classification, language modeling as well as super-resolution. ", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mengye Ren;Renjie Liao;Raquel Urtasun;Fabian H. Sinz;Richard S. Zemel", "authorids": "mren@cs.toronto.edu;rjliao@cs.toronto.edu;urtasun@cs.toronto.edu;fabian.sinz@epagoge.de;zemel@cs.toronto.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nren2017normalizing,\ntitle={Normalizing the Normalizers: Comparing and Extending Network Normalization Schemes},\nauthor={Mengye Ren and Renjie Liao and Raquel Urtasun and Fabian H. Sinz and Richard S. Zemel},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=rk5upnsxe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=rk5upnsxe", "pdf_size": 0, "rating": "5;7;9", "confidence": "4;4;5", "rating_avg": 7.0, "confidence_avg": 4.333333333333333, "replies_avg": 13, "authors#_avg": 5, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 113, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10599721576218110062&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12 }, { "id": "rk9eAFcxg", "title": "Variational Recurrent Adversarial Deep Domain Adaptation", "track": "main", "status": "Poster", "tldr": "We propose Variational Recurrent Adversarial Deep Domain Adaptation approach to capture and transfer temporal latent dependencies in multivariate time-series data", "abstract": "We study the problem of learning domain invariant representations for time series data while transferring the complex temporal latent dependencies between the domains. Our model termed as Variational Recurrent Adversarial Deep Domain Adaptation (VRADA) is built atop a variational recurrent neural network (VRNN) and trains adversarially to capture complex temporal relationships that are domain-invariant. This is (as far as we know) the first to capture and transfer temporal latent dependencies in multivariate time-series data. 
Through experiments on real-world multivariate healthcare time-series datasets, we empirically demonstrate that learning temporal dependencies helps our model's ability to create domain-invariant representations, allowing our model to outperform current state-of-the-art deep domain adaptation approaches.", "keywords": "Deep learning;Transfer Learning", "primary_area": "", "supplementary_material": "", "author": "Sanjay Purushotham;Wilka Carvalho;Tanachat Nilanon;Yan Liu", "authorids": "spurusho@usc.edu;wcarvalh@usc.edu;nilanon@usc.edu;yanliu.cs@usc.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\npurushotham2017variational,\ntitle={Variational Recurrent Adversarial Deep Domain Adaptation},\nauthor={Sanjay Purushotham and Wilka Carvalho and Tanachat Nilanon and Yan Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=rk9eAFcxg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rk9eAFcxg", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;4;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 14, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 185, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17377225322670245&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "rkE3y85ee", "title": "Categorical Reparameterization with Gumbel-Softmax", "track": "main", "status": "Poster", "tldr": "Simple, differentiable sampling mechanism for categorical variables that can be trained in neural nets via standard backprop.", "abstract": "Categorical variables are a natural choice for representing discrete structure in the world. However, stochastic neural networks rarely use categorical latent variables due to the inability to backpropagate through samples. In this work, we present an efficient gradient estimator that replaces the non-differentiable sample from a categorical distribution with a differentiable sample from a novel Gumbel-Softmax distribution. This distribution has the essential property that it can be smoothly annealed into a categorical distribution. 
We show that our Gumbel-Softmax estimator outperforms state-of-the-art gradient estimators on structured output prediction and unsupervised generative modeling tasks with categorical latent variables, and enables large speedups on semi-supervised classification.", "keywords": "Deep learning;Semi-Supervised Learning;Optimization;Structured prediction", "primary_area": "", "supplementary_material": "", "author": "Eric Jang;Shixiang Gu;Ben Poole", "authorids": "ejang@google.com;sg717@cam.ac.uk;poole@cs.stanford.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\njang2017categorical,\ntitle={Categorical Reparameterization with Gumbel-Softmax},\nauthor={Eric Jang and Shixiang Gu and Ben Poole},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=rkE3y85ee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rkE3y85ee", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;4;5", "rating_avg": 6.333333333333333, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 3, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 7241, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8563509432417332168&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9 }, { "id": "rkE8pVcle", "title": "Learning through Dialogue Interactions by Asking Questions", "track": "main", "status": "Poster", "tldr": "We investigate how a bot can benefit from interacting with users and asking questions.", "abstract": "A good dialogue agent should have the ability to interact with users by both responding to questions and by asking questions, and importantly to learn from both types of interactions. In this work, we explore this direction by designing a simulator and a set of synthetic tasks in the movie domain that allow such interactions between a learner and a teacher. We investigate how a learner can benefit from asking questions in both offline and online reinforcement learning settings, and demonstrate that the learner improves when asking questions. Our work represents a first step in developing such end-to-end learned interactive dialogue agents.\n", "keywords": "Natural language processing", "primary_area": "", "supplementary_material": "", "author": "Jiwei Li;Alexander H. Miller;Sumit Chopra;Marc'Aurelio Ranzato;Jason Weston", "authorids": "jiwel@fb.com;ahm@fb.com;spchopra@fb.com;ranzato@fb.com;jase@fb.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nli2017learning,\ntitle={Learning through Dialogue Interactions by Asking Questions},\nauthor={Jiwei Li and Alexander H. 
Miller and Sumit Chopra and Marc'Aurelio Ranzato and Jason Weston},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=rkE8pVcle}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rkE8pVcle", "pdf_size": 0, "rating": "7;7;8", "confidence": "3;3;5", "rating_avg": 7.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 25, "authors#_avg": 5, "corr_rating_confidence": 1.0, "gs_citation": 813, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18233404209420750900&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 16 }, { "id": "rkEFLFqee", "title": "Decomposing Motion and Content for Natural Video Sequence Prediction", "track": "main", "status": "Poster", "tldr": "", "abstract": "We propose a deep neural network for the prediction of future frames in natural video sequences. To effectively handle complex evolution of pixels in videos, we propose to decompose the motion and content, two key components generating dynamics in videos. Our model is built upon the Encoder-Decoder Convolutional Neural Network and Convolutional LSTM for pixel-level prediction, which independently capture the spatial layout of an image and the corresponding temporal dynamics. By independently modeling motion and content, predicting the next frame reduces to converting the extracted content features into the next frame content by the identified motion features, which simplifies the task of prediction. Our model is end-to-end trainable over multiple time steps, and naturally learns to decompose motion and content without separate training. We evaluate the pro- posed network architecture on human activity videos using KTH, Weizmann action, and UCF-101 datasets. We show state-of-the-art performance in comparison to recent approaches. 
To the best of our knowledge, this is the first end-to-end trainable network architecture with motion and content separation to model the spatio-temporal dynamics for pixel-level future prediction in natural videos.", "keywords": "Computer vision;Deep learning;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Ruben Villegas;Jimei Yang;Seunghoon Hong;Xunyu Lin;Honglak Lee", "authorids": "rubville@umich.edu;jimyang@adobe.com;maga33@postech.ac.kr;timelin@buaa.edu.cn;honglak@umich.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nvillegas2017decomposing,\ntitle={Decomposing Motion and Content for Natural Video Sequence Prediction},\nauthor={Ruben Villegas and Jimei Yang and Seunghoon Hong and Xunyu Lin and Honglak Lee},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=rkEFLFqee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rkEFLFqee", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;4;5", "rating_avg": 6.666666666666667, "confidence_avg": 4.333333333333333, "replies_avg": 23, "authors#_avg": 5, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 740, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10892353807026411567&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "rkFBJv9gg", "title": "Learning Features of Music From Scratch", "track": "main", "status": "Poster", "tldr": "We introduce a new large-scale music dataset, define a multi-label classification task, and benchmark machine learning architectures on this task.", "abstract": "This paper introduces a new large-scale music dataset, MusicNet, to serve as a source \nof supervision and evaluation of machine learning methods for music research. \nMusicNet consists of hundreds of freely-licensed classical music recordings \nby 10 composers, written for 11 instruments, together with instrument/note \nannotations resulting in over 1 million temporal labels on 34 hours of chamber music\nperformances under various studio and microphone conditions. \n\nThe paper defines a multi-label classification task to predict notes in musical recordings, \nalong with an evaluation protocol, and benchmarks several machine learning architectures for this task: \ni) learning from spectrogram features; \nii) end-to-end learning with a neural net; \niii) end-to-end learning with a convolutional neural net. \nThese experiments show that end-to-end models trained for note prediction learn frequency\nselective filters as a low-level representation of audio. 
", "keywords": "Applications", "primary_area": "", "supplementary_material": "", "author": "John Thickstun;Zaid Harchaoui;Sham Kakade", "authorids": "thickstn@cs.washington.edu;sham@cs.washington.edu;zaid@uw.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nthickstun2017learning,\ntitle={Learning Features of Music From Scratch},\nauthor={John Thickstun and Zaid Harchaoui and Sham Kakade},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=rkFBJv9gg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rkFBJv9gg", "pdf_size": 0, "rating": "6;6;8", "confidence": "4;4;4", "rating_avg": 6.666666666666667, "confidence_avg": 4.0, "replies_avg": 13, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 266, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5348435535461737615&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "rkFd2P5gl", "title": "Leveraging Asynchronicity in Gradient Descent for Scalable Deep Learning", "track": "main", "status": "Reject", "tldr": "Overlapping communication and computation for distributed gradient descent.", "abstract": "In this paper, we present multiple approaches for improving the performance of gradient descent when utilizing mutiple compute resources. The proposed approaches span a solution space ranging from equivalence to running on a single compute device to delaying gradient updates a fixed number of times. We present a new approach, asynchronous layer-wise gradient descent that maximizes overlap of layer-wise backpropagation (computation) with gradient synchronization (communication). This approach provides maximal theoretical equivalence to the de facto gradient descent algorithm, requires limited asynchronicity across multiple iterations of gradient descent, theoretically improves overall speedup, while minimizing the additional space requirements for asynchronicity. We implement all of our proposed approaches using Caffe \u2013 a high performance Deep Learning library \u2013 and evaluate it on both an Intel Sandy Bridge cluster connected with InfiniBand as well as an NVIDIA DGX-1 connected with NVLink. The evaluations are performed on a set of well known workloads including AlexNet and GoogleNet on the ImageNet dataset. 
Our evaluation of these neural network topologies indicates asynchronous gradient descent has a speedup of up to 1.7x compared to synchronous.", "keywords": "Deep learning", "primary_area": "", "supplementary_material": "", "author": "Jeff Daily;Abhinav Vishnu;Charles Siegel", "authorids": "jeff.daily@pnnl.gov;abhinav.vishnu@pnnl.gov;charles.siegel@pnnl.gov", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ndaily2017leveraging,\ntitle={Leveraging Asynchronicity in Gradient Descent for Scalable Deep Learning},\nauthor={Jeff Daily and Abhinav Vishnu and Charles Siegel},\nyear={2017},\nurl={https://openreview.net/forum?id=rkFd2P5gl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rkFd2P5gl", "pdf_size": 0, "rating": "3;3;5", "confidence": "5;4;4", "rating_avg": 3.6666666666666665, "confidence_avg": 4.333333333333333, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:2gEj1iFL8EsJ:scholar.google.com/&scioq=Leveraging+Asynchronicity+in+Gradient+Descent+for+Scalable+Deep+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "rkGabzZgl", "title": "Dropout with Expectation-linear Regularization", "track": "main", "status": "Poster", "tldr": "", "abstract": "Dropout, a simple and effective way to train deep neural networks, has led to a number of impressive empirical successes and spawned many recent theoretical investigations. However, the gap between dropout\u2019s training and inference phases, introduced due to tractability considerations, has largely remained under-appreciated. In this work, we first formulate dropout as a tractable approximation of some latent variable model, leading to a clean view of parameter sharing and enabling further theoretical analysis. Then, we introduce (approximate) expectation-linear dropout neural networks, whose inference gap we are able to formally characterize. Algorithmically, we show that our proposed measure of the inference gap can be used to regularize the standard dropout training objective, resulting in an explicit control of the gap. Our method is as simple and efficient as standard dropout. We further prove the upper bounds on the loss in accuracy due to expectation-linearization, describe classes of input distributions that expectation-linearize easily. 
Experiments on three image classification benchmark datasets demonstrate that reducing the inference gap can indeed improve the performance consistently.", "keywords": "Theory;Deep learning;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Xuezhe Ma;Yingkai Gao;Zhiting Hu;Yaoliang Yu;Yuntian Deng;Eduard Hovy", "authorids": "xuezhem@cs.cmu.edu;yingkaig@cs.cmu.edu;zhitinghu@cs.cmu.edu;yaoliang@cs.cmu.edu;dengyuntian@gmail.com;hovy@cmu.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nma2017dropout,\ntitle={Dropout with Expectation-linear Regularization},\nauthor={Xuezhe Ma and Yingkai Gao and Zhiting Hu and Yaoliang Yu and Yuntian Deng and Eduard Hovy},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=rkGabzZgl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rkGabzZgl", "pdf_size": 0, "rating": "7;8;8", "confidence": "4;3;3", "rating_avg": 7.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 10, "authors#_avg": 6, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 61, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13982026805268750521&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "rkKCdAdgx", "title": "Compact Embedding of Binary-coded Inputs and Outputs using Bloom Filters", "track": "main", "status": "Reject", "tldr": "Bloom embeddings compactly represent sparse high-dimensional binary-coded instances without compromising accuracy", "abstract": "The size of neural network models that deal with sparse inputs and outputs is often dominated by the dimensionality of those inputs and outputs. Large models with high-dimensional inputs and outputs are difficult to train due to the limited memory of graphical processing units, and difficult to deploy on mobile devices with limited hardware. To address these difficulties, we propose Bloom embeddings, a compression technique that can be applied to the input and output of neural network models dealing with sparse high-dimensional binary-coded instances. Bloom embeddings are computationally efficient, and do not seriously compromise the accuracy of the model up to 1/5 compression ratios. In some cases, they even improve over the original accuracy, with relative increases up to 12%. We evaluate Bloom embeddings on 7 data sets and compare it against 4 alternative methods, obtaining favorable results. 
We also discuss a number of further advantages of Bloom embeddings, such as 'on-the-fly' constant-time operation, zero or marginal space requirements, training time speedups, or the fact that they do not require any change to the core model architecture or training configuration.", "keywords": "Applications;Deep learning;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Joan Serr\u00e0;Alexandros Karatzoglou", "authorids": "joan.serra@telefonica.com;alexandros.karatzoglou@telefonica.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nserr{\\`a}2017compact,\ntitle={Compact Embedding of Binary-coded Inputs and Outputs using Bloom Filters},\nauthor={Joan Serr{\\`a} and Alexandros Karatzoglou},\nyear={2017},\nurl={https://openreview.net/forum?id=rkKCdAdgx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rkKCdAdgx", "pdf_size": 0, "rating": "3;6;6", "confidence": "3;4;4", "rating_avg": 5.0, "confidence_avg": 3.6666666666666665, "replies_avg": 14, "authors#_avg": 2, "corr_rating_confidence": 1.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=566026494198497965&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "rkYmiD9lg", "title": "Exponential Machines", "track": "main", "status": "Workshop", "tldr": "A supervised machine learning algorithm with a polynomial decision function (like SVM with a polynomial kernel) that models exponentially many polynomial terms by factorizing the tensor of the parameters.", "abstract": "Modeling interactions between features improves the performance of machine learning solutions in many domains (e.g. recommender systems or sentiment analysis). In this paper, we introduce Exponential Machines (ExM), a predictor that models all interactions of every order. The key idea is to represent an exponentially large tensor of parameters in a factorized format called Tensor Train (TT). The Tensor Train format regularizes the model and lets you control the number of underlying parameters. To train the model, we develop a stochastic Riemannian optimization procedure, which allows us to fit tensors with 2^160 entries. 
We show that the model achieves state-of-the-art performance on synthetic data with high-order interactions and that it works on par with high-order factorization machines on a recommender system dataset MovieLens 100K.", "keywords": "Supervised Learning;Optimization", "primary_area": "", "supplementary_material": "", "author": "Alexander Novikov;Mikhail Trofimov;Ivan Oseledets", "authorids": "novikov@bayesgroup.ru;mikhail.trofimov@phystech.edu;i.oseledets@skoltech.ru", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nnovikov2017exponential,\ntitle={Exponential Machines},\nauthor={Alexander Novikov and Mikhail Trofimov and Ivan Oseledets},\nyear={2017},\nurl={https://openreview.net/forum?id=rkYmiD9lg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rkYmiD9lg", "pdf_size": 0, "rating": "5;6;6;7", "confidence": "4;4;4;3", "rating_avg": 6.0, "confidence_avg": 3.75, "replies_avg": 18, "authors#_avg": 3, "corr_rating_confidence": -0.816496580927726, "gs_citation": 169, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14741276431565553156&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12 }, { "id": "rkaRFYcgl", "title": "Low-rank passthrough neural networks", "track": "main", "status": "Reject", "tldr": "Describe low-rank and low-rank plus diagonal parametrizations for Highway Neural Networks, GRUs and other kinds of passthrough neural networks. Present competitive experimental results.", "abstract": "Deep learning consists in training neural networks to perform computations that sequentially unfold in many steps over a time dimension or an intrinsic depth dimension. For large depths, this is usually accomplished by specialized network architectures that are designed to mitigate the vanishing gradient problem, e.g. LSTMs, GRUs, Highway Networks and Deep Residual Networks, which are based on a single structural principle: the state passthrough. We observe that these \"Passthrough Networks\" architectures enable the decoupling of the network state size from the number of parameters of the network, a possibility that is exploited in some recent works but not thoroughly explored. In this work we propose simple, yet effective, low-rank and low-rank plus diagonal matrix parametrizations for Passthrough Networks which exploit this decoupling property, reducing the data complexity and memory requirements of the network while preserving its memory capacity.
We present competitive experimental results on several tasks, including a near state of the art result on sequential randomly-permuted MNIST classification, a hard task on natural data.", "keywords": "Deep learning", "primary_area": "", "supplementary_material": "", "author": "Antonio Valerio Miceli Barone", "authorids": "amiceli@inf.ed.ac.uk", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nbarone2017lowrank,\ntitle={Low-rank passthrough neural networks},\nauthor={Antonio Valerio Miceli Barone},\nyear={2017},\nurl={https://openreview.net/forum?id=rkaRFYcgl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rkaRFYcgl", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;0;4", "rating_avg": 5.0, "confidence_avg": 2.6666666666666665, "replies_avg": 21, "authors#_avg": 1, "corr_rating_confidence": 0.0, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8978811649600976908&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "rkjZ2Pcxe", "title": "Adding Gradient Noise Improves Learning for Very Deep Networks", "track": "main", "status": "Reject", "tldr": "Adding annealed Gaussian noise to the gradient improves training of neural networks in ways complementary to adaptive learning algorithms and the noise introduced by SGD.", "abstract": "Deep feedforward and recurrent networks have achieved impressive results in many perception and language processing applications. Recently, more complex architectures such as Neural Turing Machines and Memory Networks have been proposed for tasks including question answering and general computation, creating a new set of optimization challenges. In this paper, we explore the low-overhead and easy-to-implement optimization technique of adding annealed Gaussian noise to the gradient, which we find surprisingly effective when training these very deep architectures. Unlike classical weight noise, gradient noise injection is complementary to advanced stochastic optimization algorithms such as Adam and AdaGrad. The technique not only helps to avoid overfitting, but also can result in lower training loss. We see consistent improvements in performance across an array of complex models, including state-of-the-art deep networks for question answering and algorithm learning. We observe that this optimization strategy allows a fully-connected 20-layer deep network to escape a bad initialization with standard stochastic gradient descent. We encourage further application of this technique to additional modern neural architectures.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Arvind Neelakantan;Luke Vilnis;Quoc V. Le;Lukasz Kaiser;Karol Kurach;Ilya Sutskever;James Martens", "authorids": "arvind@cs.umass.edu;luke@cs.umass.edu;qvl@google.com;lukaszkaiser@google.com;kkurach@google.com;ilyasu@openai.com;jmartens@cs.toronto.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nneelakantan2017adding,\ntitle={Adding Gradient Noise Improves Learning for Very Deep Networks},\nauthor={Arvind Neelakantan and Luke Vilnis and Quoc V. 
Le and Lukasz Kaiser and Karol Kurach and Ilya Sutskever and James Martens},\nyear={2017},\nurl={https://openreview.net/forum?id=rkjZ2Pcxe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rkjZ2Pcxe", "pdf_size": 0, "rating": "4;4;7", "confidence": "5;4;5", "rating_avg": 5.0, "confidence_avg": 4.666666666666667, "replies_avg": 11, "authors#_avg": 7, "corr_rating_confidence": 0.5, "gs_citation": 640, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2222296730320517544&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "rkmDI85ge", "title": "Efficient Softmax Approximation for GPUs", "track": "main", "status": "Workshop", "tldr": "", "abstract": "We propose an approximate strategy to efficiently train neural network based language models over very large vocabularies. Our approach, called adaptive softmax, circumvents the linear dependency on the vocabulary size by exploiting the unbalanced word distribution to form clusters that explicitly minimize the expectation of computational complexity. Our approach further reduces the computational cost by exploiting the specificities of modern architectures and matrix-matrix vector operations, making it particularly suited for graphical processing units. Our experiments carried out on standard benchmarks, such as EuroParl and One Billion Word, show that our approach brings a large gain in efficiency over standard approximations while achieving an accuracy close to that of the full softmax.", "keywords": "Natural language processing", "primary_area": "", "supplementary_material": "", "author": "\u00c9douard Grave;Armand Joulin;Moustapha Ciss\u00e9;David Grangier;Herv\u00e9 J\u00e9gou", "authorids": "egrave@fb.com;ajoulin@fb.com;moustaphacisse@fb.com;grangier@fb.com;rvj@fb.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\ngrave2017efficient,\ntitle={Efficient Softmax Approximation for GPUs},\nauthor={\u00c9douard Grave and Armand Joulin and Moustapha Ciss\u00e9 and David Grangier and Herv\u00e9 J\u00e9gou},\nyear={2017},\nurl={https://openreview.net/forum?id=rkmDI85ge}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=rkmDI85ge", "pdf_size": 0, "rating": "6;7;7", "confidence": "5;4;3", "rating_avg": 6.666666666666667, "confidence_avg": 4.0, "replies_avg": 14, "authors#_avg": 5, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 348, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1046983305372158167&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10 }, { "id": "rkpACe1lx", "title": "HyperNetworks", "track": "main", "status": "Poster", "tldr": "We train a small RNN to generate weights for a larger RNN, and train the system end-to-end. We obtain state-of-the-art results on a variety of sequence modelling tasks.", "abstract": "This work explores hypernetworks: an approach of using one network, also known as a hypernetwork, to generate the weights for another network. We apply hypernetworks to generate adaptive weights for recurrent networks. In this case, hypernetworks can be viewed as a relaxed form of weight-sharing across layers. In our implementation, hypernetworks are trained jointly with the main network in an end-to-end fashion.
Our main result is that hypernetworks can generate non-shared weights for LSTM and achieve state-of-the-art results on a variety of sequence modelling tasks including character-level language modelling, handwriting generation and neural machine translation, challenging the weight-sharing paradigm for recurrent networks.", "keywords": "Natural language processing;Deep learning;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "David Ha;Andrew M. Dai;Quoc V. Le", "authorids": "hadavid@google.com;adai@google.com;qvl@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nha2017hypernetworks,\ntitle={HyperNetworks},\nauthor={David Ha and Andrew M. Dai and Quoc V. Le},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=rkpACe1lx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=rkpACe1lx", "pdf_size": 0, "rating": "6;7;8", "confidence": "5;4;4", "rating_avg": 7.0, "confidence_avg": 4.333333333333333, "replies_avg": 14, "authors#_avg": 3, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 1979, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3668354792918495805&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 9 }, { "id": "rkpdnIqlx", "title": "The Variational Walkback Algorithm", "track": "main", "status": "Reject", "tldr": "A new algorithm for training undirected graphical models.", "abstract": "A recognized obstacle to training undirected graphical models with latent variables such as Boltzmann machines is that the maximum likelihood training procedure requires sampling from Monte-Carlo Markov chains which may not mix well, in the inner loop of training, for each example. We first propose the idea that it is sufficient to locally carve the energy function everywhere so that its gradient points in the \"right\" direction (i.e., towards generating the data). Following on previous work on contrastive divergence, denoising autoencoders, generative stochastic networks and unsupervised learning using non-equilibrium dynamics, we propose a variational bound on the marginal log-likelihood of the data which corresponds to a new learning procedure that first walks away from data points by following the model transition operator and then trains that operator to walk backwards for each of these steps, back towards the training example. The tightness of the variational bound relies on gradually increasing temperature as we walk away from the data, at each step providing a gradient on the parameters to maximize the probability that the transition operator returns to its previous state. Interestingly, this algorithm admits a variant where there is no explicit energy function, i.e., the parameters are used to directly define the transition operator. 
This also eliminates the explicit need for symmetric weights which previous Boltzmann machine or Hopfield net models require, and which makes these models less biologically plausible.", "keywords": "Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Anirudh Goyal;Nan Rosemary Ke;Alex Lamb;Yoshua Bengio", "authorids": "anirudhgoyal9119@gmail.com;rosemary.nan.ke@gmail.com;lambalex@iro.umontreal.ca;yoshua.umontreal@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ngoyal2017the,\ntitle={The Variational Walkback Algorithm},\nauthor={Anirudh Goyal and Nan Rosemary Ke and Alex Lamb and Yoshua Bengio},\nyear={2017},\nurl={https://openreview.net/forum?id=rkpdnIqlx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rkpdnIqlx", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;5;5", "rating_avg": 4.333333333333333, "confidence_avg": 4.666666666666667, "replies_avg": 10, "authors#_avg": 4, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10017708629402184139&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "rksfwnFxl", "title": "LSTM-Based System-Call Language Modeling and Ensemble Method for Host-Based Intrusion Detection", "track": "main", "status": "Reject", "tldr": "", "abstract": "In computer security, designing a robust intrusion detection system is one of the most fundamental and important problems. In this paper, we propose a system-call language-modeling approach for designing anomaly-based host intrusion detection systems. To remedy the issue of high false-alarm rates commonly arising in conventional methods, we employ a novel ensemble method that blends multiple thresholding classifiers into a single one, making it possible to accumulate `highly normal' sequences. The proposed system-call language model has various advantages leveraged by the fact that it can learn the semantic meaning and interactions of each system call that existing methods cannot effectively consider. Through diverse experiments on public benchmark datasets, we demonstrate the validity and effectiveness of the proposed method. 
Moreover, we show that our model possesses high portability, which is one of the key aspects of realizing successful intrusion detection systems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gyuwan Kim;Hayoon Yi;Jangho Lee;Yunheung Paek;Sungroh Yoon", "authorids": "kgwmath@snu.ac.kr;hyyi@snu.ac.kr;ubuntu@snu.ac.kr;ypaek@snu.ac.kr;sryoon@snu.ac.kr", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nkim2017lstmbased,\ntitle={{LSTM}-Based System-Call Language Modeling and Ensemble Method for Host-Based Intrusion Detection},\nauthor={Gyuwan Kim and Hayoon Yi and Jangho Lee and Yunheung Paek and Sungroh Yoon},\nyear={2017},\nurl={https://openreview.net/forum?id=rksfwnFxl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rksfwnFxl", "pdf_size": 0, "rating": "5;5;8", "confidence": "3;4;3", "rating_avg": 6.0, "confidence_avg": 3.3333333333333335, "replies_avg": 11, "authors#_avg": 5, "corr_rating_confidence": -0.5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16114622809270900231&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "rkuDV6iex", "title": "An Empirical Analysis of Deep Network Loss Surfaces", "track": "main", "status": "Reject", "tldr": "Analyzing the loss surface of deep neural network trained with different optimization methods", "abstract": "The training of deep neural networks is a high-dimension optimization problem with respect to the loss function of a model. Unfortunately, these functions are of high dimension and non-convex and hence difficult to characterize. In this paper, we empirically investigate the geometry of the loss functions for state-of-the-art networks with multiple stochastic optimization methods. We do this through several experiments that are visualized on polygons to understand how and when these stochastic optimization methods find minima.", "keywords": "Deep learning", "primary_area": "", "supplementary_material": "", "author": "Daniel Jiwoong Im;Michael Tao;Kristin Branson", "authorids": "daniel.im@aifounded.com;mtao@dgp.toronto.edu;bransonk@janelia.hhmi.org", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nim2017an,\ntitle={An Empirical Analysis of Deep Network Loss Surfaces},\nauthor={Daniel Jiwoong Im and Michael Tao and Kristin Branson},\nyear={2017},\nurl={https://openreview.net/forum?id=rkuDV6iex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rkuDV6iex", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;4;4", "rating_avg": 4.666666666666667, "confidence_avg": 4.0, "replies_avg": 11, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 53, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10967426100898927232&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "rky3QW9le", "title": "Transformational Sparse Coding", "track": "main", "status": "Reject", "tldr": "We extend sparse coding to include general affine transformations. 
We present a novel technical approach to circumvent inference intractability.", "abstract": "\nA fundamental problem faced by object recognition systems is that\nobjects and their features can appear in different locations, scales\nand orientations. Current deep learning methods attempt to achieve\ninvariance to local translations via pooling, discarding the locations\nof features in the process. Other approaches explicitly learn\ntransformed versions of the same feature, leading to representations\nthat quickly explode in size. Instead of discarding the rich and\nuseful information about feature transformations to achieve\ninvariance, we argue that models should learn object features\nconjointly with their transformations to achieve equivariance. We\npropose a new model of unsupervised learning based on sparse coding\nthat can learn object features jointly with their affine\ntransformations directly from images. Results based on learning from\nnatural images indicate that our approach\nmatches the reconstruction quality of traditional sparse coding but\nwith significantly fewer degrees of freedom while simultaneously\nlearning transformations from data. These results open the door to\nscaling up unsupervised learning to allow deep feature+transformation\nlearning in a manner consistent with the ventral+dorsal stream\narchitecture of the primate visual cortex.", "keywords": "Unsupervised Learning;Computer vision;Optimization", "primary_area": "", "supplementary_material": "", "author": "Dimitrios C. Gklezakos;Rajesh P. N. Rao", "authorids": "gklezd@cs.washington.edu;rao@cs.washington.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ngklezakos2017transformational,\ntitle={Transformational Sparse Coding},\nauthor={Dimitrios C. Gklezakos and Rajesh P. N. Rao},\nyear={2017},\nurl={https://openreview.net/forum?id=rky3QW9le}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rky3QW9le", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.0, "replies_avg": 18, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16347366713493878575&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "ry18Ww5ee", "title": "Hyperband: Bandit-Based Configuration Evaluation for Hyperparameter Optimization", "track": "main", "status": "Poster", "tldr": "", "abstract": "Performance of machine learning algorithms depends critically on identifying a good set of hyperparameters. While recent approaches use Bayesian Optimization to adaptively select configurations, we focus on speeding up random search through adaptive resource allocation. We present Hyperband, a novel algorithm for hyperparameter optimization that is simple, flexible, and theoretically sound. Hyperband is a principled early-stopping method that adaptively allocates a predefined resource, e.g., iterations, data samples or number of features, to randomly sampled configurations. We compare Hyperband with state-of-the-art Bayesian Optimization methods on several hyperparameter optimization problems. We observe that Hyperband can provide over an order of magnitude speedups over competitors on a variety of neural network and kernel-based learning problems.
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lisha Li;Kevin Jamieson;Giulia DeSalvo;Afshin Rostamizadeh;Ameet Talwalkar", "authorids": "lishal@cs.ucla.edu;kjamieson@berkeley.edu;desalvo@cims.nyu.edu;rostami@google.com;ameet@cs.ucla.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nli2017hyperband,\ntitle={Hyperband: Bandit-Based Configuration Evaluation for Hyperparameter Optimization},\nauthor={Lisha Li and Kevin Jamieson and Giulia DeSalvo and Afshin Rostamizadeh and Ameet Talwalkar},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=ry18Ww5ee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=ry18Ww5ee", "pdf_size": 0, "rating": "7;7;8", "confidence": "5;4;4", "rating_avg": 7.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 17, "authors#_avg": 5, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 196, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2173316299848311968&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "ry2YOrcge", "title": "Learning a Natural Language Interface with Neural Programmer", "track": "main", "status": "Poster", "tldr": "To our knowledge, this paper presents the first weakly supervised, end-to-end neural network model to induce programs on a real-world dataset.", "abstract": "Learning a natural language interface for database tables is a challenging task that involves deep language understanding and multi-step reasoning. The task is often approached by mapping natural language queries to logical forms or programs that provide the desired response when executed on the database. To our knowledge, this paper presents the first weakly supervised, end-to-end neural network model to induce such programs on a real-world dataset. We enhance the objective function of Neural Programmer, a neural network with built-in discrete operations, and apply it on WikiTableQuestions, a natural language question-answering dataset. The model is trained end-to-end with weak supervision of question-answer pairs, and does not require domain-specific grammars, rules, or annotations that are key elements in previous approaches to program induction. The main experimental result in this paper is that a single Neural Programmer model achieves 34.2% accuracy using only 10,000 examples with weak supervision. An ensemble of 15 models, with a trivial combination technique, achieves 37.7% accuracy, which is competitive to the current state-of-the-art accuracy of 37.1% obtained by a traditional natural language semantic parser.", "keywords": "Natural language processing;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Arvind Neelakantan;Quoc V. Le;Martin Abadi;Andrew McCallum;Dario Amodei", "authorids": "arvind@cs.umass.edu;qvl@google.com;abadi@google.com;mccallum@cs.umass.edu;damodei@openai.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nneelakantan2017learning,\ntitle={Learning a Natural Language Interface with Neural Programmer},\nauthor={Arvind Neelakantan and Quoc V. 
Le and Martin Abadi and Andrew McCallum and Dario Amodei},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=ry2YOrcge}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer5;AnonReviewer4", "site": "https://openreview.net/forum?id=ry2YOrcge", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;4;3", "rating_avg": 6.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 16, "authors#_avg": 5, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 139, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15635397884771830147&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "ry3iBFqgl", "title": "NEWSQA: A MACHINE COMPREHENSION DATASET", "track": "main", "status": "Reject", "tldr": "Crowdsourced QA dataset with natural language questions and multi-word answers", "abstract": "We present NewsQA, a challenging machine comprehension dataset of over 100,000 question-answer pairs. Crowdworkers supply questions and answers based on a set of over 10,000 news articles from CNN, with answers consisting in spans of text from the corresponding articles. We collect this dataset through a four- stage process designed to solicit exploratory questions that require reasoning. A thorough analysis confirms that NewsQA demands abilities beyond simple word matching and recognizing entailment. We measure human performance on the dataset and compare it to several strong neural models. The performance gap between humans and machines (25.3% F1) indicates that significant progress can be made on NewsQA through future research. The dataset is freely available at datasets.maluuba.com/NewsQA.", "keywords": "Natural language processing;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Adam Trischler;Tong Wang;Xingdi Yuan;Justin Harris;Alessandro Sordoni;Philip Bachman;Kaheer Suleman", "authorids": "adam.trischler@maluuba.com;tong.wang@maluuba.com;eric.yuan@maluuba.com;justin.harris@maluuba.com;alessandro.sordoni@maluuba.com;phil.bachman@maluuba.com;k.suleman@maluuba.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\ntrischler2017newsqa,\ntitle={{NEWSQA}: A {MACHINE} {COMPREHENSION} {DATASET}},\nauthor={Adam Trischler and Tong Wang and Xingdi Yuan and Justin Harris and Alessandro Sordoni and Philip Bachman and Kaheer Suleman},\nyear={2017},\nurl={https://openreview.net/forum?id=ry3iBFqgl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=ry3iBFqgl", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;4;3", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 16, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 1003, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2131610341122918040&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "id": "ry4Vrt5gl", "title": "Learning to Optimize", "track": "main", "status": "Poster", "tldr": "We explore learning an optimization algorithm automatically. ", "abstract": "Algorithm design is a laborious process and often requires many iterations of ideation and validation. In this paper, we explore automating algorithm design and present a method to learn an optimization algorithm. 
We approach this problem from a reinforcement learning perspective and represent any particular optimization algorithm as a policy. We learn an optimization algorithm using guided policy search and demonstrate that the resulting algorithm outperforms existing hand-engineered algorithms in terms of convergence speed and/or the final objective value. ", "keywords": "Reinforcement Learning;Optimization", "primary_area": "", "supplementary_material": "", "author": "Ke Li;Jitendra Malik", "authorids": "ke.li@eecs.berkeley.edu;malik@eecs.berkeley.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nli2017learning,\ntitle={Learning to Optimize},\nauthor={Ke Li and Jitendra Malik},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=ry4Vrt5gl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=ry4Vrt5gl", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;4;4", "rating_avg": 6.666666666666667, "confidence_avg": 4.0, "replies_avg": 20, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 69, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16488737098711868878&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 17 }, { "id": "ry54RWtxx", "title": "Learning a Static Analyzer: A Case Study on a Toy Language", "track": "main", "status": "Reject", "tldr": "", "abstract": "Static analyzers are meta-programs that analyze programs to detect\n potential errors or collect information. For example, they are used\n as security tools to detect potential buffer overflows. Also, they\n are used by compilers to verify that a program is well-formed and\n collect information to generate better code. In this paper, we\n address the following question: can a static analyzer be learned\n from data? More specifically, can we use deep learning to learn a\n static analyzer without the need for complicated feature\n engineering? We show that long short-term memory networks are able\n to learn a basic static analyzer for a simple toy language. However,\n pre-existing approaches based on feature engineering, hidden Markov\n models, or basic recurrent neural networks fail on such a simple\n problem. Finally, we show how to make such a tool usable by\n employing a language model to help the programmer detect where the\n reported errors are located.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Manzil Zaheer;Jean-Baptiste Tristan;Michael L. Wick;Guy L. Steele Jr.", "authorids": "manzil.zaheer@cmu.edu;jean.baptiste.tristan@oracle.com;michael.wick@oracle.com;guy.steele@oracle.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nzaheer2017learning,\ntitle={Learning a Static Analyzer: A Case Study on a Toy Language},\nauthor={Manzil Zaheer and Jean-Baptiste Tristan and Michael L. Wick and Guy L. 
Steele Jr.},\nyear={2017},\nurl={https://openreview.net/forum?id=ry54RWtxx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ry54RWtxx", "pdf_size": 0, "rating": "3;3;4", "confidence": "5;4;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.333333333333333, "replies_avg": 12, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9404629276128608631&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "ry7O1ssex", "title": "Generative Adversarial Networks as Variational Training of Energy Based Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we study deep generative models for effective unsupervised learning. We propose VGAN, which works by minimizing a variational lower bound of the negative log likelihood (NLL) of an energy based model (EBM), where the model density $p(\\mathbf{x})$ is approximated by a variational distribution $q(\\mathbf{x})$ that is easy to sample from. The training of VGAN takes a two step procedure: given $p(\\mathbf{x})$, $q(\\mathbf{x})$ is updated to maximize the lower bound; $p(\\mathbf{x})$ is then updated one step with samples drawn from $q(\\mathbf{x})$ to decrease the lower bound. VGAN is inspired by the generative adversarial networks (GANs), where $p(\\mathbf{x})$ corresponds to the discriminator and $q(\\mathbf{x})$ corresponds to the generator, but with several notable differences. We hence name our model variational GANs (VGANs). VGAN provides a practical solution to training deep EBMs in high dimensional space, by eliminating the need of MCMC sampling. From this view, we are also able to identify causes to the difficulty of training GANs and propose viable solutions.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shuangfei Zhai;Yu Cheng;Rogerio Feris;Zhongfei Zhang", "authorids": "szhai2@binghamton.edu;chengyu@us.ibm.com;rsferis@us.ibm.com;zhongfei@cs.binghamton.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nzhai2017generative,\ntitle={Generative Adversarial Networks as Variational Training of Energy Based Models},\nauthor={Shuangfei Zhai and Yu Cheng and Rogerio Feris and Zhongfei Zhang},\nyear={2017},\nurl={https://openreview.net/forum?id=ry7O1ssex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ry7O1ssex", "pdf_size": 0, "rating": "4;4;4", "confidence": "3;5;5", "rating_avg": 4.0, "confidence_avg": 4.333333333333333, "replies_avg": 17, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11474665754575506716&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "ryAe2WBee", "title": "Multi-label learning with semantic embeddings", "track": "main", "status": "Reject", "tldr": "The SEM approach to multi-label learning models labels using multinomial distributions parametrized by nonlinear functions of the instance features, is scalable and outperforms current state-of-the-art algorithms", "abstract": "Multi-label learning aims to automatically assign to an instance (e.g., an image or a document) the most relevant subset of labels from a large set of possible 
labels. The main challenge is to maintain accurate predictions while scaling efficiently on data sets with extremely large label sets and many training data points. We propose a simple but effective neural net approach, the Semantic Embedding Model (SEM), that models the labels for an instance as draws from a multinomial distribution parametrized by nonlinear functions of the instance features. A Gauss-Siedel mini-batch adaptive gradient descent algorithm is used to fit the model. To handle extremely large label sets, we propose and experimentally validate the efficacy of fitting randomly chosen marginal label distributions. Experimental results on eight real-world data sets show that SEM garners significant performance gains over existing methods. In particular, we compare SEM to four recent state-of-the-art algorithms (NNML, BMLPL, REmbed, and SLEEC) and find that SEM uniformly outperforms these algorithms in several widely used evaluation metrics while requiring significantly less training time.\n", "keywords": "Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Liping Jing;MiaoMiao Cheng;Liu Yang;Alex Gittens;Michael W. Mahoney", "authorids": "lpjing@bjtu.edu.cn;15112085@bjtu.edu.cn;11112191@bjtu.edu.cn;gittens@icsi.berkeley.edu;mmahoney@stat.berkeley.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\njing2017multilabel,\ntitle={Multi-label learning with semantic embeddings},\nauthor={Liping Jing and MiaoMiao Cheng and Liu Yang and Alex Gittens and Michael W. Mahoney},\nyear={2017},\nurl={https://openreview.net/forum?id=ryAe2WBee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ryAe2WBee", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;4", "rating_avg": 4.333333333333333, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:lMzb0bLZMwUJ:scholar.google.com/&scioq=Multi-label+learning+with+semantic+embeddings&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "ryCcJaqgl", "title": "TreNet: Hybrid Neural Networks for Learning the Local Trend in Time Series", "track": "main", "status": "Reject", "tldr": "", "abstract": "Local trends of time series characterize the intermediate upward and downward patterns of time series. Learning and forecasting the local trend in time series data play an important role in many real applications, ranging from investing in the stock market, resource allocation in data centers and load schedule in smart grid. Inspired by the recent successes of neural networks, in this paper we propose TreNet, a novel end-to-end hybrid neural network that predicts the local trend of time series based on local and global contextual features. TreNet leverages convolutional neural networks (CNNs) to extract salient features from local raw data of time series. Meanwhile, considering long-range dependencies existing in the sequence of historical local trends, TreNet uses a long-short term memory recurrent neural network (LSTM) to capture such dependency. Furthermore, for predicting the local trend, a feature fusion layer is designed in TreNet to learn joint representation from the features captured by CNN and LSTM. 
Our proposed TreNet demonstrates its effectiveness by outperforming conventional CNN, LSTM, HMM method and various kernel based baselines on real datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tao Lin;Tian Guo;Karl Aberer", "authorids": "tao.lin@epfl.ch;tian.guo@epfl.ch;karl.aberer@epfl.ch", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nlin2017trenet,\ntitle={TreNet: Hybrid Neural Networks for Learning the Local Trend in Time Series},\nauthor={Tao Lin and Tian Guo and Karl Aberer},\nyear={2017},\nurl={https://openreview.net/forum?id=ryCcJaqgl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=ryCcJaqgl", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;5;4", "rating_avg": 5.0, "confidence_avg": 4.333333333333333, "replies_avg": 18, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:w4Whgj5JiqIJ:scholar.google.com/&scioq=TreNet:+Hybrid+Neural+Networks+for+Learning+the+Local+Trend+in+Time+Series&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "ryEGFD9gl", "title": "Submodular Sum-product Networks for Scene Understanding", "track": "main", "status": "Reject", "tldr": "A novel extension of sum-product networks that incorporates submodular Markov random fields into the sum nodes, resulting in a highly expressive class of models in which efficient inference is still possible.", "abstract": "Sum-product networks (SPNs) are an expressive class of deep probabilistic models in which inference takes time linear in their size, enabling them to be learned effectively. However, for certain challenging problems, such as scene understanding, the corresponding SPN has exponential size and is thus intractable. In this work, we introduce submodular sum-product networks (SSPNs), an extension of SPNs in which sum-node weights are defined by a submodular energy function. SSPNs combine the expressivity and depth of SPNs with the ability to efficiently compute the MAP state of a combinatorial number of labelings afforded by submodular energies. SSPNs for scene understanding can be understood as representing all possible parses of an image over arbitrary region shapes with respect to an image grammar. Despite this complexity, we develop an efficient and convergent algorithm based on graph cuts for computing the (approximate) MAP state of an SSPN, greatly increasing the expressivity of the SPN model class. Empirically, we show exponential improvements in parsing time compared to traditional inference algorithms such as alpha-expansion and belief propagation, while returning comparable minima.\n", "keywords": "Computer vision;Structured prediction", "primary_area": "", "supplementary_material": "", "author": "Abram L. Friesen;Pedro Domingos", "authorids": "afriesen@cs.washington.edu;pedrod@cs.washington.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nfriesen2017submodular,\ntitle={Submodular Sum-product Networks for Scene Understanding},\nauthor={Abram L. 
Friesen and Pedro Domingos},\nyear={2017},\nurl={https://openreview.net/forum?id=ryEGFD9gl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ryEGFD9gl", "pdf_size": 0, "rating": "4;4;5", "confidence": "3;3;4", "rating_avg": 4.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 10, "authors#_avg": 2, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:5KmMbPPnRCQJ:scholar.google.com/&scioq=Submodular+Sum-product+Networks+for+Scene+Understanding&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "ryF7rTqgl", "title": "Understanding intermediate layers using linear classifier probes", "track": "main", "status": "Reject", "tldr": "New useful concept of information to understand deep learning.", "abstract": "Neural network models have a reputation for being black boxes. We propose a new method to better understand the roles and dynamics of the intermediate layers. This has direct consequences on the design of such models and it enables the expert to be able to justify certain heuristics (such as adding auxiliary losses in middle layers). Our method uses linear classifiers, referred to as ``probes'', where a probe can only use the hidden units of a given intermediate layer as discriminating features. Moreover, these probes cannot affect the training phase of a model, and they are generally added after training. They allow the user to visualize the state of the model at multiple steps of training. We demonstrate how this can be used to develop a better intuition about models and to diagnose potential problems.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Guillaume Alain;Yoshua Bengio", "authorids": "guillaume.alain.umontreal@gmail.com;yoshua.bengio@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nalain2017understanding,\ntitle={Understanding intermediate layers using linear classifier probes},\nauthor={Guillaume Alain and Yoshua Bengio},\nyear={2017},\nurl={https://openreview.net/forum?id=ryF7rTqgl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=ryF7rTqgl", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;3", "rating_avg": 4.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 11, "authors#_avg": 2, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 1030, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14298525025703106025&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "ryHlUtqge", "title": "Generalizing Skills with Semi-Supervised Reinforcement Learning", "track": "main", "status": "Poster", "tldr": "We propose an algorithm for generalizing a deep neural network policy using \"unlabeled\" experience collected in MDPs where rewards are not available.", "abstract": "Deep reinforcement learning (RL) can acquire complex behaviors from low-level inputs, such as images. However, real-world applications of such methods require generalizing to the vast variability of the real world. Deep networks are known to achieve remarkable generalization when provided with massive amounts of labeled data, but can we provide this breadth of experience to an RL agent, such as a robot? 
The robot might continuously learn as it explores the world around it, even while it is deployed and performing useful tasks. However, this learning requires access to a reward function, to tell the agent whether it is succeeding or failing at its task. Such reward functions are often hard to measure in the real world, especially in domains such as robotics and dialog systems, where the reward could depend on the unknown positions of objects or the emotional state of the user. On the other hand, it is often quite practical to provide the agent with reward functions in a limited set of situations, such as when a human supervisor is present, or in a controlled laboratory setting. Can we make use of this limited supervision, and still benefit from the breadth of experience an agent might collect in the unstructured real world? In this paper, we formalize this problem setting as semi-supervised reinforcement learning (SSRL), where the reward function can only be evaluated in a set of \u201clabeled\u201d MDPs, and the agent must generalize its behavior to the wide range of states it might encounter in a set of \u201cunlabeled\u201d MDPs, by using experience from both settings. Our proposed method infers the task objective in the unlabeled MDPs through an algorithm that resembles inverse RL, using the agent\u2019s own prior experience in the labeled MDPs as a kind of demonstration of optimal behavior. We evaluate our method on challenging, continuous control tasks that require control directly from images, and show that our approach can improve the generalization of a learned deep neural network policy by using experience for which no reward function is available. We also show that our method outperforms direct supervised learning of the reward.", "keywords": "Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Chelsea Finn;Tianhe Yu;Justin Fu;Pieter Abbeel;Sergey Levine", "authorids": "cbfinn@eecs.berkeley.edu;tianhe.yu@berkeley.edu;justinfu@eecs.berkeley.edu;pabbeel@eecs.berkeley.edu;svlevine@eecs.berkeley.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nfinn2017generalizing,\ntitle={Generalizing Skills with Semi-Supervised Reinforcement Learning},\nauthor={Chelsea Finn and Tianhe Yu and Justin Fu and Pieter Abbeel and Sergey Levine},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=ryHlUtqge}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer2", "site": "https://openreview.net/forum?id=ryHlUtqge", "pdf_size": 0, "rating": "6;7;8", "confidence": "5;3;4", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 5, "corr_rating_confidence": -0.5, "gs_citation": 93, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3176455604373641836&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "ryMxXPFex", "title": "Discrete Variational Autoencoders", "track": "main", "status": "Poster", "tldr": "We present a novel method to train a class of probabilistic models with discrete latent variables using the variational autoencoder framework, including backpropagation through the discrete latent variables.", "abstract": "Probabilistic models with discrete latent variables naturally capture datasets composed of discrete classes. 
However, they are difficult to train efficiently, since backpropagation through discrete variables is generally not possible. We present a novel method to train a class of probabilistic models with discrete latent variables using the variational autoencoder framework, including backpropagation through the discrete latent variables. The associated class of probabilistic models comprises an undirected discrete component and a directed hierarchical continuous component. The discrete component captures the distribution over the disconnected smooth manifolds induced by the continuous component. As a result, this class of models efficiently learns both the class of objects in an image, and their specific realization in pixels, from unsupervised data; and outperforms state-of-the-art methods on the permutation-invariant MNIST, Omniglot, and Caltech-101 Silhouettes datasets.", "keywords": "Deep learning;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Jason Tyler Rolfe", "authorids": "jrolfe@dwavesys.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@inproceedings{\nrolfe2017discrete,\ntitle={Discrete Variational Autoencoders},\nauthor={Jason Tyler Rolfe},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=ryMxXPFex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=ryMxXPFex", "pdf_size": 0, "rating": "8;8;9", "confidence": "4;2;4", "rating_avg": 8.333333333333334, "confidence_avg": 3.3333333333333335, "replies_avg": 23, "authors#_avg": 1, "corr_rating_confidence": 0.5000000000000001, "gs_citation": 327, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13951733257052467330&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "ryPx38qge", "title": "A hybrid network: Scattering and Convnet", "track": "main", "status": "Reject", "tldr": "This paper shows how, by combining prior and supervised representations, one can create architectures that lead to nearly state-of-the-art results on standard benchmarks.", "abstract": "This paper shows how, by combining prior and supervised representations, one can create architectures that lead to nearly state-of-the-art results on standard benchmarks, which mean they perform as well as a deep network learned from scratch. We use scattering as a generic and fixed initialization of the first layers of a deep network, and learn the remaining layers in a supervised manner. We numerically demonstrate that deep hybrid scattering networks generalize better on small datasets than supervised deep networks. Scattering networks could help current systems to save computation time, while guaranteeing the stability to geometric transformations and noise of the first internal layers. We also show that the learned operators explicitly build invariances to geometrical variabilities, such as local rotation and translation, by analyzing the third layer of our architecture. We demonstrate that it is possible to replace the scattering transform by a standard deep network at the cost of having to learn more parameters and potentially adding instabilities. Finally, we release a new software, ScatWave, using GPUs for fast computations of a scattering network that is integrated in Torch. 
We evaluate our model on the CIFAR10, CIFAR100 and STL10 datasets.", "keywords": "Computer vision;Unsupervised Learning;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Edouard Oyallon", "authorids": "edouard.oyallon@ens.fr", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\noyallon2017a,\ntitle={A hybrid network: Scattering and Convnet},\nauthor={Edouard Oyallon},\nyear={2017},\nurl={https://openreview.net/forum?id=ryPx38qge}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=ryPx38qge", "pdf_size": 0, "rating": "5;7;7", "confidence": "4;3;4", "rating_avg": 6.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 17, "authors#_avg": 1, "corr_rating_confidence": -0.49999999999999983, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=307048808460401689&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "ryQbbFile", "title": "CAN AI GENERATE LOVE ADVICE?: TOWARD NEURAL ANSWER GENERATION FOR NON-FACTOID QUESTIONS", "track": "main", "status": "Reject", "tldr": "", "abstract": "\nDeep learning methods that extract answers for non-factoid questions from QA sites are seen as critical since they can assist users in reaching their next decisions through conversations with AI systems. The current methods, however, have the following two problems: (1) They can not understand the ambiguous use of words in the questions as word usage can strongly depend on the context (e.g. the word \u201crelationship\u201d has quite different meanings in the categories of Love advice and other categories). As a result, the accuracies of their answer selections are not good enough. (2) The current methods can only select from among the answers held by QA sites and can not generate new ones. Thus, they can not answer the questions that are somewhat different with those stored in QA sites. Our solution, Neural Answer Construction Model, tackles these problems as it: (1) Incorporates the biases of semantics behind questions (e.g. categories assigned to questions) into word embeddings while also computing them regardless of the semantics. As a result, it can extract answers that suit the contexts of words used in the question as well as following the common usage of words across semantics. This improves the accuracy of answer selection. (2) Uses biLSTM to compute the embeddings of questions as well as those of the sentences often used to form answers (e.g. sentences representing conclusions or those supplementing the conclusions). It then simultaneously learns the optimum combination of those sentences as well as the closeness between the question and those sentences. As a result, our model can construct an answer that corresponds to the situation that underlies the question; it fills the gap between answer selection and generation and is the first model to move beyond the current simple answer selection model for non-factoid QAs. Evaluations using datasets created for love advice stored in the Japanese QA site, Oshiete goo, indicate that our model achieves 20 % higher accuracy in answer creation than the strong baselines. 
Our model is practical and has already been applied to the love advice service in Oshiete goo.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Makoto Nakatsuji;Hisashi Ito;Naruhiro Ikeda;Shota Sagara;Akihisa Fujita", "authorids": "nakatuji@nttr.co.jp;h-ito@nttr.co.jp;nikeda@nttr.co.jp;s-sagara@nttr.co.jp;akihisa@nttr.co.jp", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nnakatsuji2017can,\ntitle={{CAN} {AI} {GENERATE} {LOVE} {ADVICE}?: {TOWARD} {NEURAL} {ANSWER} {GENERATION} {FOR} {NON}-{FACTOID} {QUESTIONS}},\nauthor={Makoto Nakatsuji and Hisashi Ito and Naruhiro Ikeda and Shota Sagara and Akihisa Fujita},\nyear={2017},\nurl={https://openreview.net/forum?id=ryQbbFile}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=ryQbbFile", "pdf_size": 0, "rating": "4;4;4", "confidence": "0;4;4", "rating_avg": 4.0, "confidence_avg": 2.6666666666666665, "replies_avg": 6, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5618194990532273394&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "ryT4pvqll", "title": "Improving Policy Gradient by Exploring Under-appreciated Rewards", "track": "main", "status": "Poster", "tldr": "We present a novel form of policy gradient for model-free reinforcement learning with improved exploration properties.", "abstract": "This paper presents a novel form of policy gradient for model-free reinforcement learning (RL) with improved exploration properties. Current policy-based methods use entropy regularization to encourage undirected exploration of the reward landscape, which is ineffective in high dimensional spaces with sparse rewards. We propose a more directed exploration strategy that promotes exploration of under-appreciated reward regions. An action sequence is considered under-appreciated if its log-probability under the current policy under-estimates its resulting reward. The proposed exploration strategy is easy to implement, requiring only small modifications to the standard REINFORCE algorithm. We evaluate the approach on a set of algorithmic tasks that have long challenged RL methods. We find that our approach reduces hyper-parameter sensitivity and demonstrates significant improvements over baseline methods. Notably, the approach is able to solve a benchmark multi-digit addition task. 
To our knowledge, this is the first time that a pure RL method has solved addition using only reward feedback.", "keywords": "Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Ofir Nachum;Mohammad Norouzi;Dale Schuurmans", "authorids": "ofirnachum@google.com;mnorouzi@google.com;schuurmans@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nnachum2017improving,\ntitle={Improving Policy Gradient by Exploring Under-appreciated Rewards},\nauthor={Ofir Nachum and Mohammad Norouzi and Dale Schuurmans},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=ryT4pvqll}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ryT4pvqll", "pdf_size": 0, "rating": "7;7;8", "confidence": "4;4;4", "rating_avg": 7.333333333333333, "confidence_avg": 4.0, "replies_avg": 19, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13542244687141849688&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "ryT9R3Yxe", "title": "Generative Paragraph Vector", "track": "main", "status": "Reject", "tldr": "With a complete generative process, our models are able to infer vector representations as well as labels over unseen texts.", "abstract": "The recently introduced Paragraph Vector is an efficient method for learning high-quality distributed representations for pieces of texts. However, an inherent limitation of Paragraph Vector is lack of ability to infer distributed representations for texts outside of the training set. To tackle this problem, we introduce a Generative Paragraph Vector, which can be viewed as a probabilistic extension of the Distributed Bag of Words version of Paragraph Vector with a complete generative process. With the ability to infer the distributed representations for unseen texts, we can further incorporate text labels into the model and turn it into a supervised version, namely Supervised Generative Paragraph Vector. In this way, we can leverage the labels paired with the texts to guide the representation learning, and employ the learned model for prediction tasks directly. 
Experiments on five text classification benchmark collections show that both model architectures can yield superior classification performance over the state-of-the-art counterparts.\n", "keywords": "Natural language processing;Deep learning;Unsupervised Learning;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Ruqing Zhang;Jiafeng Guo;Yanyan Lan;Jun Xu;Xueqi Cheng", "authorids": "zhangruqing@software.ict.ac.cn;guojiafeng@ict.ac.cn;lanyanyan@ict.ac.cn;junxu@ict.ac.cn;cxq@ict.ac.cn", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nzhang2017generative,\ntitle={Generative Paragraph Vector},\nauthor={Ruqing Zhang and Jiafeng Guo and Yanyan Lan and Jun Xu and Xueqi Cheng},\nyear={2017},\nurl={https://openreview.net/forum?id=ryT9R3Yxe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=ryT9R3Yxe", "pdf_size": 0, "rating": "2;3;4", "confidence": "5;4;4", "rating_avg": 3.0, "confidence_avg": 4.333333333333333, "replies_avg": 6, "authors#_avg": 5, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13810083048204159919&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "ryTYxh5ll", "title": "CONTENT2VEC: SPECIALIZING JOINT REPRESENTATIONS OF PRODUCT IMAGES AND TEXT FOR THE TASK OF PRODUCT RECOMMENDATION", "track": "main", "status": "Reject", "tldr": "We propose a unified product embedded representation that is optimized for the task of retrieval-based product recommendation.", "abstract": "We propose a unified product embedded representation that is optimized for the task of retrieval-based product recommendation. We generate this representation using Content2Vec, a new deep architecture that merges product content information such as text and image and we analyze its performance on hard recommendation setups such as cold-start and cross-category recommendations. 
In the case of a normal recommendation regime where collaborative information signal is available we merge the product co-occurrence information and propose a second architecture Content2vec+ and show its lift in performance versus non-hybrid approaches.", "keywords": "Applications", "primary_area": "", "supplementary_material": "", "author": "Thomas Nedelec;Elena Smirnova;Flavian Vasile", "authorids": "t.nedelec@criteo.com;e.smirnova@criteo.com;f.vasile@criteo.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nnedelec2017contentvec,\ntitle={{CONTENT}2{VEC}: {SPECIALIZING} {JOINT} {REPRESENTATIONS} {OF} {PRODUCT} {IMAGES} {AND} {TEXT} {FOR} {THE} {TASK} {OF} {PRODUCT} {RECOMMENDATION}},\nauthor={Thomas Nedelec and Elena Smirnova and Flavian Vasile},\nyear={2017},\nurl={https://openreview.net/forum?id=ryTYxh5ll}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=ryTYxh5ll", "pdf_size": 0, "rating": "3;3;5", "confidence": "3;3;3", "rating_avg": 3.6666666666666665, "confidence_avg": 3.0, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13446300766152135118&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "ryUPiRvge", "title": "Extrapolation and learning equations", "track": "main", "status": "Workshop", "tldr": "We present the learning of analytical equation from data using a new forward network architecture.", "abstract": "In classical machine learning, regression is treated as a black box process of identifying a\nsuitable function from a hypothesis set without attempting to gain insight into the mechanism connecting inputs and outputs.\nIn the natural sciences, however, finding an interpretable function for a phenomenon is the prime goal as it allows to understand and generalize results. This paper proposes a novel type of function learning network, called equation learner (EQL), that can learn analytical expressions and is able to extrapolate to unseen domains. It is implemented as an end-to-end differentiable feed-forward network and allows for efficient gradient based training. Due to sparsity regularization concise interpretable expressions can be obtained. Often the true underlying source expression is identified.\n", "keywords": "Supervised Learning;Deep learning;Structured prediction", "primary_area": "", "supplementary_material": "", "author": "Georg Martius;Christoph H. 
Lampert", "authorids": "gmartius@ist.ac.at;chl@ist.ac.at", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nlee2017making,\ntitle={Making Stochastic Neural Networks from Deterministic Ones},\nauthor={Kimin Lee and Jaehyung Kim and Song Chong and Jinwoo Shin},\nyear={2017},\nurl={https://openreview.net/forum?id=B1akgy9xx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer5;AnonReviewer4;AnonReviewer2", "site": "https://openreview.net/forum?id=ryUPiRvge", "pdf_size": 0, "rating": "3;6;7", "confidence": "4;3;4", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 14, "authors#_avg": 2, "corr_rating_confidence": -0.2773500981126145, "gs_citation": 222, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2392537206754527730&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "ryWKREqxx", "title": "Emergent Predication Structure in Vector Representations of Neural Readers", "track": "main", "status": "Reject", "tldr": "Provide some novel insights on reading comprehension models and boost the performance of those models", "abstract": "Reading comprehension is a question answering task where the answer is to be found in a given passage about entities and events not mentioned in general knowledge sources. A significant number of neural architectures for this task (neural readers) have recently been developed and evaluated on large cloze-style datasets. We present experiments supporting the emergence of \u201cpredication structure\u201d in the hidden state vectors of a class of neural readers including the Attentive Reader and Stanford Reader. We posits that the hidden state vectors can be viewed as (a representation of) a concatenation [P, c] of a \u201cpredicate vector\u201d P and a \u201cconstant symbol vector\u201d c and that the hidden state represents the atomic formula P(c). This predication structure plays a conceptual role in relating \u201caggregation readers\u201d such as the Attentive Reader and the Stanford Reader to \u201cexplicit reference readers\u201d such as the Attention-Sum Reader, the Gated-Attention Reader and the Attention-over-Attention Reader. 
In an independent contribution, we show that the addition of linguistics features to the input to existing neural readers significantly boosts performance yielding the best results to date on the Who-did-What dataset.", "keywords": "Natural language processing;Deep learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Hai Wang;Takeshi Onishi;Kevin Gimpel;David McAllester", "authorids": "haiwang@ttic.edu;tonishi@ttic.edu;kgimpel@ttic.edu;mcallester@ttic.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nwang2017emergent,\ntitle={Emergent Predication Structure in Vector Representations of Neural Readers},\nauthor={Hai Wang and Takeshi Onishi and Kevin Gimpel and David McAllester},\nyear={2017},\nurl={https://openreview.net/forum?id=ryWKREqxx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=ryWKREqxx", "pdf_size": 0, "rating": "5;6;6", "confidence": "3;5;4", "rating_avg": 5.666666666666667, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 4, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6656729080647999339&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 0 }, { "id": "ryXZmzNeg", "title": "Improving Sampling from Generative Autoencoders with Markov Chains", "track": "main", "status": "Reject", "tldr": "Iteratively encoding and decoding samples from generative autoencoders recovers samples from the true latent distribution learned by the model", "abstract": "We focus on generative autoencoders, such as variational or adversarial autoencoders, which jointly learn a generative model alongside an inference model. Generative autoencoders are those which are trained to softly enforce a prior on the latent distribution learned by the inference model. We call the distribution to which the inference model maps observed samples, the learned latent distribution, which may not be consistent with the prior. We formulate a Markov chain Monte Carlo (MCMC) sampling process, equivalent to iteratively decoding and encoding, which allows us to sample from the learned latent distribution. Since, the generative model learns to map from the learned latent distribution, rather than the prior, we may use MCMC to improve the quality of samples drawn from the generative model, especially when the learned latent distribution is far from the prior. 
Using MCMC sampling, we are able to reveal previously unseen differences between generative autoencoders trained either with or without a denoising criterion.", "keywords": "Deep learning;Unsupervised Learning;Theory", "primary_area": "", "supplementary_material": "", "author": "Antonia Creswell;Kai Arulkumaran;Anil Anthony Bharath", "authorids": "ac2211@imperial.ac.uk;ka709@imperial.ac.uk;aab01@imperial.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ncreswell2017improving,\ntitle={Improving Sampling from Generative Autoencoders with Markov Chains},\nauthor={Antonia Creswell and Kai Arulkumaran and Anil Anthony Bharath},\nyear={2017},\nurl={https://openreview.net/forum?id=ryXZmzNeg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=ryXZmzNeg", "pdf_size": 0, "rating": "3;3;3", "confidence": "5;4;4", "rating_avg": 3.0, "confidence_avg": 4.333333333333333, "replies_avg": 14, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7886143132090388826&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "ryZqPN5xe", "title": "Beyond Fine Tuning: A Modular Approach to Learning on Small Data", "track": "main", "status": "Reject", "tldr": "A better way to do deep learning with small amounts of training data", "abstract": "In this paper we present a technique to train neural network models on small amounts of data. Current methods for training neural networks on small amounts of rich data typically rely on strategies such as fine-tuning a pre-trained neural network or the use of domain-specific hand-engineered features. Here we take the approach of treating network layers, or entire networks, as modules and combine pre-trained modules with untrained modules, to learn the shift in distributions between data sets. The central impact of using a modular approach comes from adding new representations to a network, as opposed to replacing representations via fine-tuning. Using this technique, we are able surpass results using standard fine-tuning transfer learning approaches, and we are also able to significantly increase performance over such approaches when using smaller amounts of data. 
", "keywords": "Deep learning;Supervised Learning;Transfer Learning", "primary_area": "", "supplementary_material": "", "author": "Aryk Anderson;Kyle Shaffer;Artem Yankov;Court Corley;Nathan Hodas", "authorids": "aryk.anderson@eagles.ewu.edu;kyle.shaffer@pnnl.gov;artem.yankov@pnnl.gov;court@pnnl.gov;nathan.hodas@pnnl.gov", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nanderson2017beyond,\ntitle={Beyond Fine Tuning: A Modular Approach to Learning on Small Data},\nauthor={Aryk Anderson and Kyle Shaffer and Artem Yankov and Court Corley and Nathan Hodas},\nyear={2017},\nurl={https://openreview.net/forum?id=ryZqPN5xe}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ryZqPN5xe", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;2;5", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 5, "corr_rating_confidence": -0.18898223650461365, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=91123793007935710&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "ry_4vpixl", "title": "Rotation Plane Doubly Orthogonal Recurrent Neural Networks", "track": "main", "status": "Reject", "tldr": "Recurrent equation for RNNs that uses the composition of two orthogonal transitions, one time invariant and one modulated by input, that doesn't suffer from vanishing or exploding gradients.", "abstract": "Recurrent Neural Networks (RNNs) applied to long sequences suffer from the well known vanishing and exploding gradients problem. The recently proposed Unitary Evolution Recurrent Neural Network (uRNN) alleviates the exploding gradient problem and can learn very long dependencies, but its nonlinearities make it still affected by the vanishing gradient problem and so learning can break down for extremely long dependencies. We propose a new RNN transition architecture where the hidden state is updated multiplicatively by a time invariant orthogonal transformation followed by an input modulated orthogonal transformation. There are no additive interactions and so our architecture exactly preserves forward hid-den state activation norm and backwards gradient norm for all time steps, and is provably not affected by vanishing or exploding gradients. We propose using the rotation plane parameterization to represent the orthogonal matrices. 
We validate our model on a simplified memory copy task and see that our model can learn dependencies as long as 5,000 timesteps.", "keywords": "Deep learning;Theory", "primary_area": "", "supplementary_material": "", "author": "Zoe McCarthy;Andrew Bai;Xi Chen;Pieter Abbeel", "authorids": "zmccarthy@berkeley.edu;;;", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nmccarthy2017rotation,\ntitle={Rotation Plane Doubly Orthogonal Recurrent Neural Networks},\nauthor={Zoe McCarthy and Andrew Bai and Xi Chen and Pieter Abbeel},\nyear={2017},\nurl={https://openreview.net/forum?id=ry_4vpixl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=ry_4vpixl", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;3", "rating_avg": 4.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 6, "authors#_avg": 4, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:FyNwGHrmIvEJ:scholar.google.com/&scioq=Rotation+Plane+Doubly+Orthogonal+Recurrent+Neural+Networks&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "ry_sjFqgx", "title": "Program Synthesis for Character Level Language Modeling", "track": "main", "status": "Poster", "tldr": "", "abstract": "We propose a statistical model applicable to character level language modeling and show that it is a good fit for both, program source code and English text. The model is parameterized by a program from a domain-specific language (DSL) that allows expressing non-trivial data dependencies. Learning is done in two phases: (i) we synthesize a program from the DSL, essentially learning a good representation for the data, and (ii) we learn parameters from the training data - the process is done via counting, as in simple language models such as n-gram.\n\nOur experiments show that the precision of our model is comparable to that of neural networks while sharing a number of advantages with n-gram models such as fast query time and the capability to quickly add and remove training data samples. 
Further, the model is parameterized by a program that can be manually inspected, understood and updated, addressing a major problem of neural networks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pavol Bielik;Veselin Raychev;Martin Vechev", "authorids": "pavol.bielik@inf.ethz.ch;veselin.raychev@inf.ethz.ch;martin.vechev@inf.ethz.ch", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nbielik2017program,\ntitle={Program Synthesis for Character Level Language Modeling},\nauthor={Pavol Bielik and Veselin Raychev and Martin Vechev},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=ry_sjFqgx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=ry_sjFqgx", "pdf_size": 0, "rating": "5;8;8", "confidence": "3;2;4", "rating_avg": 7.0, "confidence_avg": 3.0, "replies_avg": 13, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4968856614184858481&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "ryaFG5ige", "title": "Introducing Active Learning for CNN under the light of Variational Inference", "track": "main", "status": "Reject", "tldr": "Building automatically the labeled training set with active learning for CNN. The criterion is developed on a variational inference for NN and a kronecker approximation of Fisher matrices for CNN", "abstract": "One main concern of the deep learning community is to increase the capacity of\nrepresentation of deep networks by increasing their depth. This requires to scale\nup the size of the training database accordingly. Indeed a major intuition lies in\nthe fact that the depth of the network and the size of the training set are strongly\ncorrelated. However recent works tend to show that deep learning may be handled\nwith smaller dataset as long as the training samples are carefully selected (let us\nmention for instance curriculum learning). In this context we introduce a scalable\nand efficient active learning method that can be applied to most neural networks,\nespecially Convolutional Neural Networks (CNN). To the best of our knowledge,\nthis paper is the first of its kind to design an active learning selection scheme based\non a variational inference for neural networks. We also deduced a formulation of\nthe posterior and prior distributions of the weights using statistical knowledge on\nthe Maximum Likelihood Estimator.\nWe describe our strategy to come up with our active learning criterion. We assess its\nconsistency by checking the accuracy obtained by successive active learning steps\non two benchmark datasets MNIST and USPS. 
We also demonstrate its scalability\ntowards increasing training set size.", "keywords": "Deep learning;Supervised Learning;Optimization", "primary_area": "", "supplementary_material": "", "author": "Melanie Ducoffe;Frederic Precioso", "authorids": "ducoffe@i3s.unice.fr;precioso@i3s.unice.fr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nducoffe2017introducing,\ntitle={Introducing Active Learning for {CNN} under the light of Variational Inference},\nauthor={Melanie Ducoffe and Frederic Precioso},\nyear={2017},\nurl={https://openreview.net/forum?id=ryaFG5ige}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer2", "site": "https://openreview.net/forum?id=ryaFG5ige", "pdf_size": 0, "rating": "6;6;6", "confidence": "2;2;1", "rating_avg": 6.0, "confidence_avg": 1.6666666666666667, "replies_avg": 20, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:12K_8vIqMw8J:scholar.google.com/&scioq=Introducing+Active+Learning+for+CNN+under+the+light+of+Variational+Inference&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "ryb-q1Olg", "title": "Rectified Factor Networks for Biclustering", "track": "main", "status": "Reject", "tldr": "", "abstract": "Biclustering is evolving into one of the major tools for analyzing large datasets given as matrix of samples times features. Biclustering has several noteworthy applications and has been successfully applied in life sciences and e-commerce for drug design and recommender systems, respectively.\n\nFABIA is one of the most successful biclustering methods and is used by companies like Bayer, Janssen, or Zalando. FABIA is a generative model that represents each bicluster by two sparse membership vectors: one for the samples and one for the features. However, FABIA is restricted to about 20 code units because of the high computational complexity of computing the posterior. Furthermore, code units are sometimes insufficiently decorrelated. Sample membership is difficult to determine because vectors do not have exact zero entries and can have both large positive and large negative values.\n\nWe propose to use the recently introduced unsupervised Deep Learning approach Rectified Factor Networks (RFNs) to overcome the drawbacks of existing biclustering methods. RFNs efficiently construct very sparse, non-linear, high-dimensional representations of the input via their posterior means. RFN learning is a generalized alternating minimization algorithm based on the posterior regularization method which enforces non-negative and normalized posterior means. Each code unit represents a bicluster, where samples for which the code unit is active belong to the bicluster and features that have activating weights to the code unit belong to the bicluster.\n\nOn 400 benchmark datasets with artificially implanted biclusters, RFN significantly outperformed 13 other biclustering competitors including FABIA. In biclustering experiments on three gene expression datasets with known clusters that were determined by separate measurements, RFN biclustering was two times significantly better than the other 13 methods and once on second place. 
On data of the 1000 Genomes Project, RFN could identify DNA segments which indicate, that interbreeding with other hominins starting already before ancestors of modern humans left Africa.", "keywords": "Deep learning;Unsupervised Learning;Applications", "primary_area": "", "supplementary_material": "", "author": "Djork-Arn\u00e9 Clevert;Thomas Unterthiner;Sepp Hochreiter", "authorids": "okko@bioinf.jku.at;unterthiner@bioinf.jku.at;hochreit@bioinf.jku.at", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nclevert2017rectified,\ntitle={Rectified Factor Networks for Biclustering},\nauthor={Djork-Arn{\\'e} Clevert and Thomas Unterthiner and Sepp Hochreiter},\nyear={2017},\nurl={https://openreview.net/forum?id=ryb-q1Olg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=ryb-q1Olg", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;2;2", "rating_avg": 4.666666666666667, "confidence_avg": 2.6666666666666665, "replies_avg": 6, "authors#_avg": 3, "corr_rating_confidence": -1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:aXlO1Y5cWKgJ:scholar.google.com/&scioq=Rectified+Factor+Networks+for+Biclustering&hl=en&as_sdt=0,33", "gs_version_total": 3 }, { "id": "rye9LT8cee", "title": "Alternating Direction Method of Multipliers for Sparse Convolutional Neural Networks", "track": "main", "status": "Reject", "tldr": "A method to sparsify (prune) pre-trained deep neural networks.", "abstract": "The storage and computation requirements of Convolutional Neural Networks (CNNs) can be prohibitive for exploiting these models over low-power or embedded devices. This paper reduces the computational complexity of the CNNs by minimizing an objective function, including the recognition loss that is augmented with a sparsity-promoting penalty term. The sparsity structure of the network is identified using the Alternating Direction Method of Multipliers (ADMM), which is widely used in large optimization problems. This method alternates between promoting the sparsity of the network and optimizing the recognition performance, which allows us to exploit the two-part structure of the corresponding objective functions. In particular, we take advantage of the separability of the sparsity-inducing penalty functions to decompose the minimization problem into sub-problems that can be solved sequentially. Applying our method to a variety of state-of-the-art CNN models, our proposed method is able to simplify the original model, generating models with less computation and fewer parameters, while maintaining and often improving generalization performance. Accomplishments on a variety of models strongly verify that our proposed ADMM-based method can be a very useful tool for simplifying and improving deep CNNs. 
", "keywords": "Deep learning;Computer vision;Optimization", "primary_area": "", "supplementary_material": "", "author": "Farkhondeh Kiaee;Christian Gagn\u00e9;and Mahdieh Abbasi", "authorids": "farkhondeh.kiaee.1@ulaval.ca;christian.gagne@gel.ulaval.ca;mahdieh.abbasi.1@ulaval.ca", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nkiaee2017alternating,\ntitle={Alternating Direction Method of Multipliers for Sparse Convolutional Neural Networks},\nauthor={Farkhondeh Kiaee and Christian Gagn{\\'e} and and Mahdieh Abbasi},\nyear={2017},\nurl={https://openreview.net/forum?id=rye9LT8cee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=rye9LT8cee", "pdf_size": 0, "rating": "5;6;7;7", "confidence": "3;4;5;3", "rating_avg": 6.25, "confidence_avg": 3.75, "replies_avg": 17, "authors#_avg": 3, "corr_rating_confidence": 0.4545454545454545, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=858247473313576525&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "ryelgY5eg", "title": "Optimal Binary Autoencoding with Pairwise Correlations", "track": "main", "status": "Poster", "tldr": "Efficient biconvex learning of binary autoencoders, using pairwise correlations between encodings and decodings, is strongly optimal.", "abstract": "We formulate learning of a binary autoencoder as a biconvex optimization problem which learns from the pairwise correlations between encoded and decoded bits. Among all possible algorithms that use this information, ours finds the autoencoder that reconstructs its inputs with worst-case optimal loss. The optimal decoder is a single layer of artificial neurons, emerging entirely from the minimax loss minimization, and with weights learned by convex optimization. All this is reflected in competitive experimental results, demonstrating that binary autoencoding can be done efficiently by conveying information in pairwise correlations in an optimal fashion. 
", "keywords": "Theory;Unsupervised Learning;Games", "primary_area": "", "supplementary_material": "", "author": "Akshay Balsubramani", "authorids": "abalsubr@stanford.edu", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@inproceedings{\nbalsubramani2017optimal,\ntitle={Optimal Binary Autoencoding with Pairwise Correlations},\nauthor={Akshay Balsubramani},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=ryelgY5eg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=ryelgY5eg", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;3;2", "rating_avg": 6.666666666666667, "confidence_avg": 3.0, "replies_avg": 6, "authors#_avg": 1, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17789535936754794040&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "ryh9pmcee", "title": "Energy-based Generative Adversarial Networks", "track": "main", "status": "Poster", "tldr": "We introduce the \"Energy-based Generative Adversarial Network\" (EBGAN) model.", "abstract": "We introduce the \"Energy-based Generative Adversarial Network\" model (EBGAN) which views the discriminator as an energy function that attributes low energies to the regions near the data manifold and higher energies to other regions. Similar to the probabilistic GANs, a generator is seen as being trained to produce contrastive samples with minimal energies, while the discriminator is trained to assign high energies to these generated samples. Viewing the discriminator as an energy function allows to use a wide variety of architectures and loss functionals in addition to the usual binary classifier with logistic output. Among them, we show one instantiation of EBGAN framework as using an auto-encoder architecture, with the energy being the reconstruction error, in place of the discriminator. We show that this form of EBGAN exhibits more stable behavior than regular GANs during training. 
We also show that a single-scale architecture can be trained to generate high-resolution images.", "keywords": "Deep learning;Unsupervised Learning;Semi-Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Junbo Zhao;Michael Mathieu;Yann LeCun", "authorids": "jakezhao@cs.nyu.edu;mathieu@cs.nyu.edu;yann@cs.nyu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nzhao2017energybased,\ntitle={Energy-based Generative Adversarial Networks},\nauthor={Junbo Zhao and Michael Mathieu and Yann LeCun},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=ryh9pmcee}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=ryh9pmcee", "pdf_size": 0, "rating": "7;7;8", "confidence": "5;3;3", "rating_avg": 7.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 16, "authors#_avg": 3, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 1681, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15426746467469595309&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "ryh_8f9lg", "title": "Classless Association using Neural Networks", "track": "main", "status": "Reject", "tldr": "Learning based on the relation between two instances of the same unknown class", "abstract": "The goal of this paper is to train a model based on the relation between two instances that represent the same unknown class. This scenario is inspired by the Symbol Grounding Problem and the association learning in infants. We propose a novel model called Classless Association. It has two parallel Multilayer Perceptrons (MLP) that uses one network as a target of the other network, and vice versa. In addition, the presented model is trained based on an EM-approach, in which the output vectors are matched against a statistical distribution. We generate four classless datasets based on MNIST, where the input is two different instances of the same digit. In addition, the digits have a uniform distribution. Furthermore, our classless association model is evaluated against two scenarios: totally supervised and totally unsupervised. In the first scenario, our model reaches a good performance in terms of accuracy and the classless constraint. 
In the second scenario, our model reaches better results against two clustering algorithms.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Federico Raue;Sebastian Palacio;Andreas Dengel;Marcus Liwicki", "authorids": "federico.raue@dfki.de;sebastian.palacio@dfki.de;andreas.dengel@dfki.de;liwicki@cs.uni-kl.de", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nraue2017classless,\ntitle={Classless Association using Neural Networks},\nauthor={Federico Raue and Sebastian Palacio and Andreas Dengel and Marcus Liwicki},\nyear={2017},\nurl={https://openreview.net/forum?id=ryh_8f9lg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=ryh_8f9lg", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;4;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 10, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7706306934512908828&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "ryhqQFKgl", "title": "Towards Deep Interpretability (MUS-ROVER II): Learning Hierarchical Representations of Tonal Music", "track": "main", "status": "Poster", "tldr": "", "abstract": "Music theory studies the regularity of patterns in music to capture concepts underlying music styles and composers' decisions. This paper continues the study of building \\emph{automatic theorists} (rovers) to learn and represent music concepts that lead to human interpretable knowledge and further lead to materials for educating people. Our previous work took a first step in algorithmic concept learning of tonal music, studying high-level representations (concepts) of symbolic music (scores) and extracting interpretable rules for composition. This paper further studies the representation \\emph{hierarchy} through the learning process, and supports \\emph{adaptive} 2D memory selection in the resulting language model. This leads to a deeper-level interpretability that expands from individual rules to a dynamic system of rules, making the entire rule learning process more cognitive. The outcome is a new rover, MUS-ROVER \\RN{2}, trained on Bach's chorales, which outputs customizable syllabi for learning compositional rules. We demonstrate comparable results to our music pedagogy, while also presenting the differences and variations. In addition, we point out the rover's potential usages in style recognition and synthesis, as well as applications beyond music.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haizi Yu;Lav R. Varshney", "authorids": "haiziyu7@illinois.edu;varshney@illinois.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nyu2017towards,\ntitle={Towards Deep Interpretability ({MUS}-{ROVER} {II}): Learning Hierarchical Representations of Tonal Music},\nauthor={Haizi Yu and Lav R. 
Varshney},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=ryhqQFKgl}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ryhqQFKgl", "pdf_size": 0, "rating": "6;6;8", "confidence": "3;3;4", "rating_avg": 6.666666666666667, "confidence_avg": 3.3333333333333335, "replies_avg": 15, "authors#_avg": 2, "corr_rating_confidence": 0.9999999999999998, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17198945779695545354&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "ryjp1c9xg", "title": "Extensions and Limitations of the Neural GPU", "track": "main", "status": "Reject", "tldr": "", "abstract": "The Neural GPU is a recent model that can learn algorithms such as multi-digit binary addition and binary multiplication in a way that generalizes to inputs of arbitrary length. We show that there are two simple ways of improving the performance of the Neural GPU: by carefully designing a curriculum, and by increasing model size. The latter requires a memory efficient implementation, as a naive implementation of the Neural GPU is memory intensive. We find that these techniques increase the set of algorithmic problems that can be solved by the Neural GPU: we have been able to learn to perform all the arithmetic operations (and generalize to arbitrarily long numbers) when the arguments are given in the decimal representation (which, surprisingly, has not been possible before). We have also been able to train the Neural GPU to evaluate long arithmetic expressions with multiple operands that require respecting the precedence order of the operands, although these have succeeded only in their binary representation, and not with perfect accuracy.\n\nIn addition, we gain insight into the Neural GPU by investigating its failure modes. We find that Neural GPUs that correctly generalize to arbitrarily long numbers still fail to compute the correct answer on highly-symmetric, atypical inputs: for example, a Neural GPU that achieves near-perfect generalization on decimal multiplication of up to 100-digit long numbers can fail on $000000\\dots002 \\times 000000\\dots002$ while succeeding at $2 \\times 2$. 
These failure modes are reminiscent of adversarial examples.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Eric Price;Wojciech Zaremba;Ilya Sutskever", "authorids": "ecprice@cs.utexas.edu;woj@openai.com;ilyasu@openai.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nprice2017extensions,\ntitle={Extensions and Limitations of the Neural {GPU}},\nauthor={Eric Price and Wojciech Zaremba and Ilya Sutskever},\nyear={2017},\nurl={https://openreview.net/forum?id=ryjp1c9xg}\n}", "github": "", "project": "", "reviewers": "AnonReviewer5;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ryjp1c9xg", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;3", "rating_avg": 4.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18409205007825405582&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "ryrGawqex", "title": "Deep Learning with Dynamic Computation Graphs", "track": "main", "status": "Poster", "tldr": "We make batching effective and easy to use for neural nets where every input may have a different shape (e.g. TreeRNNs).", "abstract": "Neural networks that compute over graph structures are a natural fit for problems in a variety of domains, including natural language (parse trees) and cheminformatics (molecular graphs). However, since the computation graph has a different shape and size for every input, such networks do not directly support batched training or inference. They are also difficult to implement in popular deep learning libraries, which are based on static data-flow graphs. We introduce a technique called dynamic batching, which not only batches together operations between different input graphs of dissimilar shape, but also between different nodes within a single input graph. The technique allows us to create static graphs, using popular libraries, that emulate dynamic computation graphs of arbitrary shape and size. We further present a high-level library of compositional blocks that simplifies the creation of dynamic graph models. 
Using the library, we demonstrate concise and batch-wise parallel implementations for a variety of models from the literature.", "keywords": "Deep learning", "primary_area": "", "supplementary_material": "", "author": "Moshe Looks;Marcello Herreshoff;DeLesley Hutchins;Peter Norvig", "authorids": "madscience@google.com;marcelloh@google.com;delesley@google.com;pnorvig@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nlooks2017deep,\ntitle={Deep Learning with Dynamic Computation Graphs},\nauthor={Moshe Looks and Marcello Herreshoff and DeLesley Hutchins and Peter Norvig},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=ryrGawqex}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=ryrGawqex", "pdf_size": 0, "rating": "7;8;8", "confidence": "5;3;3", "rating_avg": 7.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 14, "authors#_avg": 4, "corr_rating_confidence": -1.0, "gs_citation": 172, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17598784344933868555&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "id": "ryuxYmvel", "title": "HolStep: A Machine Learning Dataset for Higher-order Logic Theorem Proving", "track": "main", "status": "Poster", "tldr": "", "abstract": "Large computer-understandable proofs consist of millions of intermediate\nlogical steps. The vast majority of such steps originate from manually\nselected and manually guided heuristics applied to intermediate goals.\nSo far, machine learning has generally not been used to filter or\ngenerate these steps. In this paper, we introduce a new dataset based on\nHigher-Order Logic (HOL) proofs, for the purpose of developing new\nmachine learning-based theorem-proving strategies. We make this dataset\npublicly available under the BSD license. We propose various machine\nlearning tasks that can be performed on this dataset, and discuss their\nsignificance for theorem proving. We also benchmark a set of simple baseline\nmachine learning models suited for the tasks (including logistic regression,\nconvolutional neural networks and recurrent neural networks). 
The results of our\nbaseline models show the promise of applying machine learning to HOL\ntheorem proving.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Cezary Kaliszyk;Fran\u00e7ois Chollet;Christian Szegedy", "authorids": "cezary.kaliszyk@uibk.ac.at;fchollet@google.com;szegedy@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nkaliszyk2017holstep,\ntitle={HolStep: A Machine Learning Dataset for Higher-order Logic Theorem Proving},\nauthor={Cezary Kaliszyk and Fran{\\c{c}}ois Chollet and Christian Szegedy},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=ryuxYmvel}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=ryuxYmvel", "pdf_size": 0, "rating": "6;7;8", "confidence": "3;3;3", "rating_avg": 7.0, "confidence_avg": 3.0, "replies_avg": 11, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 108, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14476774781002042104&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9 }, { "id": "rywUcQogx", "title": "Differentiable Canonical Correlation Analysis", "track": "main", "status": "Reject", "tldr": "We propose Differentiable CCA a formulation of CCA that enables gradient flow through the computation of the CCA projection matrices.", "abstract": "Canonical Correlation Analysis (CCA) computes maximally-correlated \nlinear projections of two modalities. We propose Differentiable CCA, a \nformulation of CCA that can be cast as a layer within a multi-view \nneural network. Unlike Deep CCA, an earlier extension of CCA to \nnonlinear projections, our formulation enables gradient flow through the \ncomputation of the CCA projection matrices, and free choice of the final \noptimization target. We show the effectiveness of this approach in \ncross-modality retrieval experiments on two public image-to-text \ndatasets, surpassing both Deep CCA and a multi-view network with \nfreely-learned projections. 
We assume that Differentiable CCA could be a \nuseful building block for many multi-modality tasks.", "keywords": "Multi-modal learning", "primary_area": "", "supplementary_material": "", "author": "Matthias Dorfer;Jan Schl\u00fcter;Gerhard Widmer", "authorids": "matthias.dorfer@jku.at;jan.schlueter@ofai.at;gerhard.widmer@jku.at", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ndorfer2017differentiable,\ntitle={Differentiable Canonical Correlation Analysis},\nauthor={Matthias Dorfer and Jan Schl{\\\"u}ter and Gerhard Widmer},\nyear={2017},\nurl={https://openreview.net/forum?id=rywUcQogx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rywUcQogx", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;4;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:9XSgix0GwcAJ:scholar.google.com/&scioq=Differentiable+Canonical+Correlation+Analysis&hl=en&as_sdt=0,5", "gs_version_total": 2 }, { "id": "ryxB0Rtxx", "title": "Identity Matters in Deep Learning", "track": "main", "status": "Poster", "tldr": "Emerging theory explaining residual networks alongside new empirical progress", "abstract": "An emerging design principle in deep learning is that each layer of a deep\nartificial neural network should be able to easily express the identity\ntransformation. This idea not only motivated various normalization techniques,\nsuch as batch normalization, but was also key to the immense success of\nresidual networks.\n\nIn this work, we put the principle of identity parameterization on a more \nsolid theoretical footing alongside further empirical progress. We first\ngive a strikingly simple proof that arbitrarily deep linear residual networks\nhave no spurious local optima. The same result for feed-forward networks in\ntheir standard parameterization is substantially more delicate. Second, we\nshow that residual networks with ReLu activations have universal finite-sample\nexpressivity in the sense that the network can represent any function of its\nsample provided that the model has more parameters than the sample size.\n\nDirectly inspired by our theory, we experiment with a radically simple\nresidual architecture consisting of only residual convolutional layers and\nReLu activations, but no batch normalization, dropout, or max pool. 
Our model\nimproves significantly on previous all-convolutional networks on the CIFAR10,\nCIFAR100, and ImageNet classification benchmarks.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Moritz Hardt;Tengyu Ma", "authorids": "m@mrtz.org;tengyu@cs.princeton.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nhardt2017identity,\ntitle={Identity Matters in Deep Learning},\nauthor={Moritz Hardt and Tengyu Ma},\nbooktitle={International Conference on Learning Representations},\nyear={2017},\nurl={https://openreview.net/forum?id=ryxB0Rtxx}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=ryxB0Rtxx", "pdf_size": 0, "rating": "5;6;8", "confidence": "4;5;3", "rating_avg": 6.333333333333333, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 2, "corr_rating_confidence": -0.6546536707079772, "gs_citation": 458, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13607975019730988794&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 } ]