[Figure 1 diagram omitted: embedding, LSTM, and dense output layers for the background (b) and contextual (c) experts, whose per-word predictions are mixed.]
Figure 1: Representation of the three models discussed. MCE is a mixture of the RNN LM and the version that concatenates the two.
[Figure content omitted: part-of-speech tag labels and example sentences generated with the Patriots and sweden subreddits as contexts.]
Figure 5: Sentences from the Concat (left) and MCE (right) models, using the subreddit as context. The top panel uses the subreddit Patriots and the bottom uses sweden. Words highlighted in green indicate that more emphasis was given to the context part of the model during generation.
the best performance boost over a backoff 5-gram model; Sundermeyer et al. (2012) show better results could be achieved with Long Short-Term Memory networks (LSTMs) (Hochreiter and Schmidhuber, 1997). Mikolov and Zweig (2012) improve performance with LDA-based pre-trained topic models of the entire document as context to an RNN LM. Those representations are concatenated into both the input and hidden layers, in an effort to achieve better document perplexity. Sordoni et al. (2015) expand on this work by setting the context vector to be the output of a feed-forward neural network, allowing the entire model to be trained end-to-end. Other papers have since incorporated context in language models for a variety of tasks, including conversation (Vinyals and Le, 2015), email suggestion (Kannan et al., 2016), machine translation (Bahdanau et al., 2015), and image captioning (Chen and Zitnick, 2015). However, concatenating context is inflexible because it assumes the effect of the context is static throughout the sentence. Therefore, we draw from the attention mechanism used in neural machine translation (Bahdanau et al., 2015), in which part of the network learns to place more importance on certain input tokens when generating each token of its translation. Likewise, we look to build a model that can pay attention to the contexts based on the specific input word.
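To make the concatenation baseline concrete, here is a minimal sketch of that design in PyTorch. It is our own illustration rather than code from any of the cited papers; the class name, dimensions, and layer choices are assumptions. A single context embedding is appended to every word embedding before the LSTM, so the context's contribution is identical at every position:

```python
import torch
import torch.nn as nn

class ConcatContextLM(nn.Module):
    """Concatenation-style context LM (illustrative): the same context
    vector is appended to the input at every time step."""

    def __init__(self, vocab_size, n_contexts, word_dim=128, ctx_dim=32, hidden_dim=256):
        super().__init__()
        self.word_emb = nn.Embedding(vocab_size, word_dim)
        self.ctx_emb = nn.Embedding(n_contexts, ctx_dim)
        self.lstm = nn.LSTM(word_dim + ctx_dim, hidden_dim, batch_first=True)
        self.out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, words, context):
        # words: (batch, seq_len) token ids; context: (batch,) context ids
        w = self.word_emb(words)                                           # (B, T, word_dim)
        c = self.ctx_emb(context).unsqueeze(1).expand(-1, w.size(1), -1)   # static across T
        h, _ = self.lstm(torch.cat([w, c], dim=-1))
        return self.out(h)                                                 # next-word logits
```

Because `c` is constant across the sequence, the model cannot modulate how much the context matters from word to word, which is exactly the limitation discussed above.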
We also draw inspiration from mixture-of-experts (MoE) models (Jacobs et al., 1991; Jordan and Jacobs, 1994). MoE is an ensemble method that improves performance by blending together the predictions of different “experts.” In MoE models, a gating network learns to mix the predictions of the various experts; the gating network can be thought of as analogous to the attention network from Bahdanau et al. (2015). The original motivation of MoE in Jacobs et al. (1991) was to build a gating network that would help experts segment the input space into local pieces, such that each could learn decoupled representations. In other words, the experts are given the same inputs, but they work competitively rather than cooperatively to learn different parts of the input space. One challenge in MoE is that the same gradient flows back to every expert, albeit with different weights, leading to correlated experts and a less expressive overall model. To avoid experts learning correlated responses, Jacobs et al. (1991) stochastically choose a single expert to update. This approach is similarly applied in Shazeer et al. (2017) as a way to lower computation costs in massively large networks.
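For readers unfamiliar with the formulation, the following is a minimal, generic mixture-of-experts gate in PyTorch. It is our own sketch of the Jacobs et al. (1991) idea, not code from that paper or from MCE, and all names and sizes are illustrative:

```python
import torch
import torch.nn as nn

class MixtureOfExperts(nn.Module):
    """Generic MoE layer (illustrative): every expert sees the same input,
    and a softmax gate mixes their outputs."""

    def __init__(self, in_dim, out_dim, n_experts=4):
        super().__init__()
        self.experts = nn.ModuleList(nn.Linear(in_dim, out_dim) for _ in range(n_experts))
        self.gate = nn.Linear(in_dim, n_experts)

    def forward(self, x):
        # x: (batch, in_dim)
        expert_outs = torch.stack([e(x) for e in self.experts], dim=1)  # (B, E, out_dim)
        weights = torch.softmax(self.gate(x), dim=-1).unsqueeze(-1)     # (B, E, 1), sums to 1
        return (weights * expert_outs).sum(dim=1)                       # convex combination
```

Updating only a stochastically sampled expert per example (Jacobs et al., 1991), or keeping only the top-scoring experts (Shazeer et al., 2017), modifies this dense gate so that experts are less likely to learn correlated responses.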
Similar to how our approach feeds each expert a different input vector, Garmash and Monz (2016) also feed different inputs to each expert. For the problem of neural machine translation, they train a network end-to-end with two experts that take in different inputs (French or German) but have the same target output: the English translation. The gating network learns to interpolate the two models into one combined model. Because the inputs are different, the representations are inherently different, and there is no notion of the experts converging to similar representations. This approach works well but requires a trilingual dataset where multiple inputs map to the same output.
7 Discussion

In this paper, we present a novel method of incorporating context information in language models, based on the idea of mixture-of-experts. By allocating each context to an expert, our MCE model can dynamically attend to different contexts as needed. This flexibility allows our model to capture richer interactions among the input word, the context identity or identities, and the input word sequence history. This advantage largely powers MCE's consistent perplexity gains over the concatenation model baselines. In addition, MCE linearly mixes the predictions of the experts for each word, which allows us to analyze its behavior. Our analyses reveal that the model exhibits desirable properties, such as increased influence of the context at the start of sentences and when predicting content words such as nouns.
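Written out explicitly (with notation that is ours and may differ from the paper's), this per-word mixture has the form

```latex
P(x_t \mid x_{<t}, \mathcal{C}) \;=\; \sum_{k=0}^{K} g_k(t)\, P_k(x_t \mid x_{<t}, c_k),
\qquad g_k(t) \ge 0, \quad \sum_{k=0}^{K} g_k(t) = 1,
```

where expert 0 is the contextless background model, each remaining expert is tied to one context c_k, and g_k(t) is the gate weight at position t. Inspecting g_k(t) token by token is what exposes the patterns noted above, such as the larger context weight at sentence starts and on nouns.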
The flexibility of the MCE framework opens up multiple possible avenues for future research. In particular, the value of the contextless background model to the scheme should be investigated: there is clear interpretive value in a background model, but allocating its hidden units to the contextual models may yield better results. In addition, in this study context is limited to community and user identity, but adding other forms of context is of interest. For instance, MCE could benefit conversation modeling by including an expert that generates predictions based specifically on conversational history, in addition to a user expert. We are also interested in using different modules for language modeling, such as QRNNs, and in other variations such as an unbalanced allocation of hidden layers across the experts.
References

Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. 2015. Neural machine translation by jointly learning to align and translate. In Proceedings of ICLR.

Jerome R. Bellegarda. 2004. Statistical language model adaptation: Review and perspectives. Speech Communication, 42(1):93–108.

James Bradbury, Stephen Merity, Caiming Xiong, and Richard Socher. 2017. Quasi-recurrent neural networks. In Proceedings of ICLR.

Ciprian Chelba, Mohammad Norouzi, and Samy Bengio. 2017. N-gram language modeling using recurrent neural network estimation. arXiv preprint arXiv:1703.10724.

Xinlei Chen and Lawrence C. Zitnick. 2015. Mind's eye: A recurrent visual representation for image caption generation. In Proceedings of CVPR.

Jan K. Chorowski, Dzmitry Bahdanau, Dmitriy Serdyuk, Kyunghyun Cho, and Yoshua Bengio. 2015. Attention-based models for speech recognition. In Proceedings of NIPS.

Junyoung Chung, Caglar Gulcehre, KyungHyun Cho, and Yoshua Bengio. 2014. Empirical evaluation of gated recurrent neural networks on sequence modeling. In Proceedings of the NIPS Deep Learning Workshop.

Cristian Danescu-Niculescu-Mizil, Robert West, Dan Jurafsky, Jure Leskovec, and Christopher Potts. 2013. No country for old members: User lifecycle and linguistic change in online communities. In Proceedings of WWW.

Jeffrey L. Elman. 1990. Finding structure in time. Cognitive Science, 14(2):179–211.

Ekaterina Garmash and Christof Monz. 2016. Ensemble learning for multi-source neural machine translation. In Proceedings of COLING.

Sepp Hochreiter and Jürgen Schmidhuber. 1997. Long short-term memory. Neural Computation, 9(8):1735–1780.

Yu-Yang Huang, Rui Yan, Tsung-Ting Kuo, and Shou-De Lin. 2014. Enriching cold start personalized language model using social network information. In Proceedings of ACL.

Robert A. Jacobs, Michael I. Jordan, Steven J. Nowlan, and Geoffrey E. Hinton. 1991. Adaptive mixtures of local experts. Neural Computation, 3(1):79–87.

Aaron Jaech, Victoria Zayats, Hao Fang, Mari Ostendorf, and Hannaneh Hajishirzi. 2015. Talking to the crowd: What do people react to in online discussions? In Proceedings of EMNLP.

Michael I. Jordan and Robert A. Jacobs. 1994. Hierarchical mixtures of experts and the EM algorithm. Neural Computation, 6(2):181–214.

Rafal Jozefowicz, Oriol Vinyals, Mike Schuster, Noam Shazeer, and Yonghui Wu. 2016. Exploring the limits of language modeling. arXiv preprint arXiv:1602.02410.

Anjuli Kannan, Karol Kurach, Sujith Ravi, Tobias Kaufmann, Andrew Tomkins, Balint Miklos, Greg Corrado, László Lukács, Marina Ganea, Peter Young, et al. 2016. Smart reply: Automated response suggestion for email. In Proceedings of ACM SIGKDD.

Diederik P. Kingma and Jimmy Ba. 2015. Adam: A method for stochastic optimization. In Proceedings of ICLR.

Hung-Yi Lee, Bo-Hsiang Tseng, Tsung-Hsien Wen, and Yu Tsao. 2017. Personalizing recurrent-neural-network-based language model by social network. In Proceedings of TASLP.

Jiwei Li, Michel Galley, Chris Brockett, Georgios Spithourakis, Jianfeng Gao, and Bill Dolan. 2016. A persona-based neural conversation model. In Proceedings of ACL.

Gábor Melis, Chris Dyer, and Phil Blunsom. 2018. On the state of the art of evaluation in neural language models. In Proceedings of ICLR.

Stephen Merity, Nitish Shirish Keskar, and Richard Socher. 2018. Regularizing and optimizing LSTM language models. In Proceedings of ICLR.

Tomas Mikolov and Geoffrey Zweig. 2012. Context dependent recurrent neural network language model. In Proceedings of SLT.

Tomas Mikolov, Martin Karafiát, Lukás Burget, Jan Cernocký, and Sanjeev Khudanpur. 2010. Recurrent neural network based language model. In Proceedings of INTERSPEECH.

Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. 2013. Distributed representations of words and phrases and their compositionality. In Proceedings of NIPS.

Iulian V. Serban, Alessandro Sordoni, Yoshua Bengio, Aaron Courville, and Joelle Pineau. 2016. Building end-to-end dialogue systems using generative hierarchical neural network models. In Proceedings of AAAI.

Noam Shazeer, Azalia Mirhoseini, Krzysztof Maziarz, Andy Davis, Quoc Le, Geoffrey Hinton, and Jeff Dean. 2017. Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. In Proceedings of ICLR.

Alessandro Sordoni, Michel Galley, Michael Auli, Chris Brockett, Yangfeng Ji, Margaret Mitchell, Jian-Yun Nie, Jianfeng Gao, and Bill Dolan. 2015. A neural network approach to context-sensitive generation of conversational responses. In Proceedings of NAACL HLT.

Andreas Stolcke. 2002. SRILM – an extensible language modeling toolkit. In Proceedings of INTERSPEECH.

Martin Sundermeyer, Ralf Schlüter, and Hermann Ney. 2012. LSTM neural networks for language modeling. In Proceedings of INTERSPEECH.

Chenhao Tan and Lillian Lee. 2015. All who wander: On the prevalence and characteristics of multi-community engagement. In Proceedings of WWW.

Trang Tran and Mari Ostendorf. 2016. Characterizing the language of online communities and its relation to community reception. In Proceedings of EMNLP.

Oriol Vinyals and Quoc Le. 2015. A neural conversational model. In Proceedings of ICLR.

Tian Wang and Kyunghyun Cho. 2016. Larger-context language modelling with recurrent neural network. In Proceedings of ACL.

Tsung-Hsien Wen, Milica Gasic, Nikola Mrkšić, Pei-Hao Su, David Vandyke, and Steve Young. 2015. Semantically conditioned LSTM-based natural language generation for spoken dialogue systems. In Proceedings of EMNLP.

Gui-Rong Xue, Jie Han, Yong Yu, and Qiang Yang. 2009. User language model for collaborative personalized search. In Proceedings of TOIS.

Seunghyun Yoon, Hyeongu Yun, Yuna Kim, Gyu-tae Park, and Kyomin Jung. 2017. Efficient transfer learning schemes for personalized language modeling using recurrent neural network. arXiv preprint arXiv:1701.03578.