CodeSwitchTransitionModel.java

package edu.berkeley.cs.nlp.ocular.model.transition;

import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.makeAddTildeMap;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.makeCanBeElidedSet;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.makeCanBeReplacedSet;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.makeDiacriticDisregardMap;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.makePunctSet;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.makeValidDoublableSet;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.makeValidSubstitutionCharsSet;
import static edu.berkeley.cs.nlp.ocular.util.CollectionHelper.makeSet;
import static edu.berkeley.cs.nlp.ocular.util.Tuple2.Tuple2;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import tberg.murphy.arrays.a;
import edu.berkeley.cs.nlp.ocular.data.textreader.Charset;
import edu.berkeley.cs.nlp.ocular.gsm.GlyphChar;
import edu.berkeley.cs.nlp.ocular.gsm.GlyphSubstitutionModel;
import edu.berkeley.cs.nlp.ocular.gsm.GlyphChar.GlyphType;
import edu.berkeley.cs.nlp.ocular.lm.CodeSwitchLanguageModel;
import edu.berkeley.cs.nlp.ocular.lm.SingleLanguageModel;
import edu.berkeley.cs.nlp.ocular.model.TransitionStateType;
import edu.berkeley.cs.nlp.ocular.util.ArrayHelper;
import edu.berkeley.cs.nlp.ocular.util.Tuple2;
import tberg.murphy.indexer.Indexer;

/**
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class CodeSwitchTransitionModel implements SparseTransitionModel {

	public class CodeSwitchTransitionState implements TransitionState {
		private final int[] context;
		public final TransitionStateType type;

		/**
		 * The current language of this state.  This may be *-1* to indicate that there is no
		 * current language state.  This will happen at, for example, the beginning of a document,
		 * where forcing a language decision before we have reached a word makes no sense.  The 
		 * null should be used to tell the system to use the language *prior* instead of the language
		 * *transition* prior.  In other words:
		 * 
		 *     p(destLang | null) = p(destLang)
		 */
		public final int langIndex;

		public final int lmCharIndex;
		public final GlyphChar glyphChar;

		public CodeSwitchTransitionState(int[] context, TransitionStateType type, int langIndex, GlyphChar glyphChar) {
			if (context == null) throw new IllegalArgumentException("context is null");
			if (glyphChar == null) throw new IllegalArgumentException("glyphChar is null");

			this.context = context;
			this.type = type;
			this.langIndex = langIndex;
			this.lmCharIndex = makeLmCharIndex(context, type);
			this.glyphChar = glyphChar;
		}

		public boolean equals(Object other) {
			if (other instanceof CodeSwitchTransitionState) {
				CodeSwitchTransitionState that = (CodeSwitchTransitionState) other;
				if (this.type != that.type || this.langIndex != that.langIndex) {
					return false;
				}
				else if (!Arrays.equals(this.context, that.context)) {
					return false;
				}
				else if (!this.glyphChar.equals(that.glyphChar)) {
					return false;
				}
				else {
					return true;
				}
			}
			else {
				return false;
			}
		}

		public int hashCode() {
			int ctxHash = Arrays.hashCode(context);
			int typeHash = this.type.ordinal();
			int langHash = this.langIndex;
			int glyphHash = this.glyphChar.hashCode();
			return 1013 * ctxHash + 1009 * typeHash + 1007 * langHash + 1017 * glyphHash;
		}

		private void addNoSubGlyphStates(List<Tuple2<TransitionState, Double>> result, int[] nextContext, TransitionStateType nextType, int nextLanguage, double transitionScore) {
			int nextLmChar = makeLmCharIndex(nextContext, nextType);
			addNoSubGlyphStates(result, nextLmChar, nextContext, nextType, nextLanguage, transitionScore);
		}
		
		private void addNoSubGlyphStates(List<Tuple2<TransitionState, Double>> result, int nextLmChar, int[] nextContext, TransitionStateType nextType, int nextLanguage, double transitionScore) {
			if (!allowGlyphSubstitution)
				addState(result, nextContext, nextType, nextLanguage, new GlyphChar(nextLmChar, GlyphType.NORMAL_CHAR), transitionScore);
			else {
				GlyphType glyphType = glyphChar.glyphType;
				
				if (nextType == TransitionStateType.RMRGN_HPHN_INIT || nextType == TransitionStateType.RMRGN_HPHN|| nextType == TransitionStateType.LMRGN_HPHN) {
					/*
					 * This always maintains whether it is marked as a tilde-elision character 
					 * or an elided character.  This is necessary right-margin-hyphen states 
					 * in which the new state is detached from the actual previous character.
					 * Note that non-hyphen margins should just use no-sub glyph since normal
					 * (non-hyphen) margins are treated as spaces, and spaces can't be elided
					 * and can't follow tilde-elision states.
					 */
					{
						GlyphChar nextGlyphChar = new GlyphChar(nextLmChar, glyphChar.glyphType);
						double glyphLogProb = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, nextGlyphChar);
						addState(result, nextContext, nextType, nextLanguage, nextGlyphChar, transitionScore + glyphLogProb);
					}
					
					if (nextType == TransitionStateType.RMRGN_HPHN_INIT) {
						/*
						 * Allow for the elision of Ouptut a space 
						 */
						GlyphChar nextGlyphChar = new GlyphChar(spaceCharIndex, glyphChar.glyphType);
						double glyphLogProb = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, nextGlyphChar);
						addState(result, nextContext, nextType, nextLanguage, nextGlyphChar, transitionScore + glyphLogProb);
					}
				}
				else {
					/*
					 * 1. Next state's glyph is just the rendering of the LM character
					 * 
					 * This is just a short-circuit of `addGlyphStates` in which no 
					 * substitution glyph states are permitted.  Useful for things
					 * like punctuation or spaces, where substitutions will never
					 * be allowed.
					 */
					if (glyphType != GlyphType.ELISION_TILDE) { // normal state can't follow an elision-marking tilde
						// 1. Next state's glyph is just the rendering of the LM character
						GlyphChar nextGlyphChar = new GlyphChar(nextLmChar, GlyphType.NORMAL_CHAR);
						double glyphLogProb = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, nextGlyphChar);
						addState(result, nextContext, nextType, nextLanguage, nextGlyphChar, transitionScore + glyphLogProb);
					}
				}
			}
		}

		/**
		 * Add transition states, allowing for the possibility of substitutions or elisions.
		 * 
		 *   1. Next state's glyph is just the rendering of the LM character
		 *   2. Next state's glyph is a substitution of the LM character
		 *   3. Next state's glyph is an elision-decorated version of the LM character
		 *   4. Next state's glyph is an elision after a tilde-decorated character
		 *   5. Next state's glyph is the LM char, stripped of its accents
		 *   6. Next state's glyph is an elision after a space
		 *   7. Next state's glyph is a doubled version of the LM character
		 * 
		 */
		private void addGlyphStates(List<Tuple2<TransitionState, Double>> result, int nextLmChar, int[] nextContext, TransitionStateType nextType, int nextLanguage, double transitionScore) {
            /*if (charIndexer.getObject(nextLmChar).equals("m")) {
                System.out.println("Context of " + charIndexer.getObject(nextLmChar) + ":\n");
                for (int c : nextContext) {
                    System.out.println(charIndexer.getObject(c));
                }
                System.out.println(transitionScore);
                if (nextContext.length > 1){
                    System.out.println(charIndexer.getObject(nextContext[nextContext.length-2]));
                    System.out.println(charIndexer.getObject(nextContext[nextContext.length-1]));
                }
                System.out.println("Done\n");
            }
            if (charIndexer.getObject(nextLmChar).equals("m") && charIndexer.getObject(context[context.length-1]).equals("u") && charIndexer.getObject(context[context.length-2]).equals("r")) {
                System.out.println("Hello\n");
            }*/
			if (!allowGlyphSubstitution)
				addState(result, nextContext, nextType, nextLanguage, new GlyphChar(nextLmChar, GlyphType.NORMAL_CHAR), transitionScore);
			else {
				Set<GlyphChar> potentialNextGlyphChars = new HashSet<GlyphChar>(); 
				GlyphType glyphType = glyphChar.glyphType;
				if (glyphType == GlyphType.DOUBLED) {
					// Deterministically duplicate the glyph (but no longer marked as "doubled")
					//potentialNextGlyphChars.add(new GlyphChar(glyphChar.templateCharIndex, GlyphType.NORMAL_CHAR));
					throw new RuntimeException("This should have been handled elsewhere so that we don't re-include ngram LM scores");
				}
				else if (glyphType == GlyphType.ELISION_TILDE) {
					// 4. An elision-tilde'd character must be followed by a tilde-elision
					if (canBeElided.contains(nextLmChar)) {
						potentialNextGlyphChars.add(new GlyphChar(spaceCharIndex, GlyphType.TILDE_ELIDED));
					}
				}
				else {
					// 1. Next state's glyph is just the rendering of the LM character
					potentialNextGlyphChars.add(new GlyphChar(nextLmChar, GlyphType.NORMAL_CHAR));

					// 2. Next state's glyph is a substitution of the LM character
					if (canBeReplaced.contains(nextLmChar)) {
						for (int nextGlyphCharIndex : lm.get(nextLanguage).getActiveCharacters()) {
							if (validSubstitutionChars.contains(nextGlyphCharIndex)) {
								potentialNextGlyphChars.add(new GlyphChar(nextGlyphCharIndex, GlyphType.NORMAL_CHAR));
							}
						}
					}
					if (nextLmChar == sCharIndex)
						potentialNextGlyphChars.add(new GlyphChar(longsCharIndex, GlyphType.NORMAL_CHAR));
					
					// 3. Next state's glyph is an elision-decorated version of the LM character
					Integer tildeDecorated = addTilde.get(nextLmChar);
					if (tildeDecorated != null) {
						potentialNextGlyphChars.add(new GlyphChar(tildeDecorated, GlyphType.ELISION_TILDE));
					}
	
					// 4. Next state's glyph is elided --- No elision can take place after a normal character 
					if (glyphType == GlyphType.TILDE_ELIDED) {
						if (canBeElided.contains(nextLmChar)) {
							potentialNextGlyphChars.add(new GlyphChar(spaceCharIndex, GlyphType.TILDE_ELIDED));
						}
					}
					
					// 5. Next state's glyph is the LM char, stripped of its accents
					Integer baseChar = diacriticDisregardMap.get(nextLmChar);
					if (baseChar != null) {
						potentialNextGlyphChars.add(new GlyphChar(baseChar, GlyphType.NORMAL_CHAR));
					}
					
					// 6. Next state's glyph is an elision after a space
					if (!elideAnything) {
					if (glyphType != GlyphType.FIRST_ELIDED) { // TODO: Comment this out if we want to allow multiple characters to be elided from the front of a word
						if (lmCharIndex == spaceCharIndex) {
							if (type != TransitionStateType.LMRGN_HPHN && type != TransitionStateType.RMRGN_HPHN_INIT && type != TransitionStateType.RMRGN_HPHN) { // only allowed at the start of a word, not in the middle of a hyphenated word
								if (nextType == TransitionStateType.TMPL) {
									if (canBeElided.contains(nextLmChar)) {
										potentialNextGlyphChars.add(new GlyphChar(spaceCharIndex, GlyphType.FIRST_ELIDED));
									}
								}
							}
						}
					}
					}
					
					// 7. Next state's glyph is a doubled version of the LM character
					if (validDoublableSet.contains(nextLmChar)) {
						potentialNextGlyphChars.add(new GlyphChar(nextLmChar, GlyphType.DOUBLED));
						if (nextLmChar == sCharIndex)
							potentialNextGlyphChars.add(new GlyphChar(longsCharIndex, GlyphType.DOUBLED));
					}
					
					// 8. Elide the character
					if (elideAnything) {
						if (nextType == TransitionStateType.TMPL) {
							if (canBeElided.contains(nextLmChar)) {
								potentialNextGlyphChars.add(new GlyphChar(spaceCharIndex, GlyphType.ELIDED));
							}
						}
					}

				}
				
				// Create states for all the potential next glyphs
				double p = 0;
				for (GlyphChar nextGlyphChar : potentialNextGlyphChars) {
					double glyphLogProb = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, nextGlyphChar);
                    if (context.length > 0 && nextGlyphChar.templateCharIndex == spaceCharIndex && charIndexer.getObject(nextLmChar).equals("u") && charIndexer.getObject(context[context.length-1]).equals("r")) {
                        glyphLogProb = -p;
                    } else if (context.length > 1 && nextGlyphChar.templateCharIndex == spaceCharIndex && charIndexer.getObject(nextLmChar).equals("m") && charIndexer.getObject(context[context.length-1]).equals("u") && charIndexer.getObject(context[context.length-2]).equals("r")) {
                        glyphLogProb = -p;
                    }

                    else if (context.length > 0 && nextGlyphChar.templateCharIndex == spaceCharIndex && charIndexer.getObject(nextLmChar).equals("e") && charIndexer.getObject(context[context.length-1]).equals("p")) {
                        glyphLogProb = -p;
                    } else if (context.length > 1 && nextGlyphChar.templateCharIndex == spaceCharIndex && charIndexer.getObject(nextLmChar).equals("r") && charIndexer.getObject(context[context.length-1]).equals("e") && charIndexer.getObject(context[context.length-2]).equals("p")) {
                        glyphLogProb = -p;
                    }

                    else if (context.length > 0 && nextGlyphChar.templateCharIndex == spaceCharIndex && charIndexer.getObject(nextLmChar).equals("t") && charIndexer.getObject(context[context.length-1]).equals("e")) {
                        glyphLogProb = -p;
                    }

                    else if (context.length > 0 && nextGlyphChar.templateCharIndex == spaceCharIndex && charIndexer.getObject(nextLmChar).equals("o") && charIndexer.getObject(context[context.length-1]).equals("c")) {
                        glyphLogProb = -p;
                    } else if (context.length > 1 && nextGlyphChar.templateCharIndex == spaceCharIndex && charIndexer.getObject(nextLmChar).equals("n") && charIndexer.getObject(context[context.length-1]).equals("o") && charIndexer.getObject(context[context.length-2]).equals("c")) {
                        glyphLogProb = -p;
                    }

                    else if (context.length > 0 && nextGlyphChar.templateCharIndex == spaceCharIndex && charIndexer.getObject(nextLmChar).equals("i") && charIndexer.getObject(context[context.length-1]).equals("l")) {
                        glyphLogProb = -p;
                    } else if (context.length > 1 && nextGlyphChar.templateCharIndex == spaceCharIndex && charIndexer.getObject(nextLmChar).equals("s") && charIndexer.getObject(context[context.length-1]).equals("i") && charIndexer.getObject(context[context.length-2]).equals("l")) {
                        glyphLogProb = -p;
                    }

                    else if (context.length > 0 && nextGlyphChar.templateCharIndex == spaceCharIndex && charIndexer.getObject(nextLmChar).equals("r") && charIndexer.getObject(context[context.length-1]).equals("p")) {
                        glyphLogProb = -p;
                    } else if (context.length > 1 && nextGlyphChar.templateCharIndex == spaceCharIndex && charIndexer.getObject(nextLmChar).equals("o") && charIndexer.getObject(context[context.length-1]).equals("r") && charIndexer.getObject(context[context.length-2]).equals("p")) {
                        glyphLogProb = -p;
                    }

                    else if (context.length > 0 && nextGlyphChar.templateCharIndex == spaceCharIndex && charIndexer.getObject(nextLmChar).equals("u") && charIndexer.getObject(context[context.length-1]).equals("q")) {
                        glyphLogProb = -p;
                    } else if (context.length > 1 && nextGlyphChar.templateCharIndex == spaceCharIndex && charIndexer.getObject(nextLmChar).equals("e") && charIndexer.getObject(context[context.length-1]).equals("u") && charIndexer.getObject(context[context.length-2]).equals("q")) {
                        glyphLogProb = -p;
                    }

                    else if (context.length > 0 && nextGlyphChar.templateCharIndex == spaceCharIndex && charIndexer.getObject(nextLmChar).equals("s") && charIndexer.getObject(context[context.length-1]).equals("u")) {
                        glyphLogProb = -p;
                    }
					addState(result, nextContext, nextType, nextLanguage, nextGlyphChar, transitionScore + glyphLogProb);
				}
			}
		}
		
		private void addTransitionsToTmpl(List<Tuple2<TransitionState, Double>> result, int[] context) {
			addTransitionsToTmpl(result, context, 0.0, false);
		}

		private void addTransitionsToTmpl(List<Tuple2<TransitionState, Double>> result, int[] context, double prevScore, boolean clearContext) {
			if (glyphChar.glyphType == GlyphType.DOUBLED) {
				// Duplicate the state: same context, language, lmChar, ...; but Doubled=>Normal
				TransitionStateType nextType = TransitionStateType.TMPL;
				int nextLanguage = langIndex;
				int nextLmChar = lmCharIndex;
				//SingleLanguageModel destLM = lm.get(nextLanguage);
				//double pDestLang = 1.0; // since there's only one language for this character, don't divide its mass across languages
				double score = prevScore; //+ Math.log(1.0 - LINE_MRGN_PROB) + Math.log(getNgramProb(destLM, context, nextLmChar)) + Math.log(pDestLang); // TODO: Is it necessary to have some sort of LM probability factored in?
				if (nextLmChar == sCharIndex) { // a doubled 's' may have long-s chars
					GlyphChar nextGlyphCharS = new GlyphChar(sCharIndex, GlyphType.NORMAL_CHAR);
					double glyphLogProbS = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, nextGlyphCharS);
					addState(result, context, nextType, nextLanguage, nextGlyphCharS, score + glyphLogProbS);

					GlyphChar nextGlyphCharLongs = new GlyphChar(longsCharIndex, GlyphType.NORMAL_CHAR);
					double glyphLogProbLongs = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, nextGlyphCharLongs);
					addState(result, context, nextType, nextLanguage, nextGlyphCharLongs, score + glyphLogProbLongs);
				}
				else {
					GlyphChar nextGlyphChar = new GlyphChar(lmCharIndex, GlyphType.NORMAL_CHAR);
					double glyphLogProb = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, nextGlyphChar);
					addState(result, context, nextType, nextLanguage, nextGlyphChar, score + glyphLogProb);
				}
			}
			else {
				if (this.langIndex < 0) { // there is no current language
					for (int destLanguage = 0; destLanguage < numLanguages; ++destLanguage) { // no current language, can switch to any language
						SingleLanguageModel destLM = lm.get(destLanguage);
						for (int c : destLM.getActiveCharacters()) { // punctuation no problem since we have no current language
							if (c != spaceCharIndex) {
								double pDestLang = lm.languagePrior(destLanguage); // no language to transition from
								double score = Math.log(1.0 - LINE_MRGN_PROB) + prevScore + Math.log(getNgramProb(destLM, context, c)) + Math.log(pDestLang);
								int[] nextContext = (!clearContext ? a.append((destLM!=null ? shrinkContext(context, destLM) : context), c) : new int[] { c });
								addGlyphStates(result, c, nextContext, TransitionStateType.TMPL, destLanguage, score);
							}
						}
					}
				}
				else { // there is a current language
					boolean switchAllowed = lmCharIndex == spaceCharIndex; // can switch if its (a non-space character) after a space
					if (switchAllowed) { // switch permitted
						for (int destLanguage = 0; destLanguage < numLanguages; ++destLanguage) {
							SingleLanguageModel destLM = lm.get(destLanguage);
							for (int c : destLM.getActiveCharacters()) {
								if (punctSet.contains(c)) {
									if (allowLanguageSwitchOnPunct) {
										double pDestLang = lm.languageTransitionProb(this.langIndex, destLanguage);
										double score = Math.log(1.0 - LINE_MRGN_PROB) + prevScore + Math.log(getNgramProb(destLM, context, c)) + Math.log(pDestLang);
										int[] nextContext = (!clearContext ? a.append((destLM!=null ? shrinkContext(context, destLM) : context), c) : new int[] { c });
										addNoSubGlyphStates(result, c, nextContext, TransitionStateType.TMPL, destLanguage, score);
									}
									else if (this.langIndex == destLanguage) { // switching not allowed, but this is the same language
										double pDestLang = 1.0; // since there's only one language for this character, don't divide its mass across languages
										double score = Math.log(1.0 - LINE_MRGN_PROB) + prevScore + Math.log(getNgramProb(destLM, context, c)) + Math.log(pDestLang);
										int[] nextContext = (!clearContext ? a.append((destLM!=null ? shrinkContext(context, destLM) : context), c) : new int[] { c });
										addNoSubGlyphStates(result, c, nextContext, TransitionStateType.TMPL, destLanguage, score);
									}
								}
								else if (c != spaceCharIndex) {
									double pDestLang = lm.languageTransitionProb(this.langIndex, destLanguage);
									double score = Math.log(1.0 - LINE_MRGN_PROB) + prevScore + Math.log(getNgramProb(destLM, context, c)) + Math.log(pDestLang);
									int[] nextContext = (!clearContext ? a.append((destLM!=null ? shrinkContext(context, destLM) : context), c) : new int[] { c });
									addGlyphStates(result, c, nextContext, TransitionStateType.TMPL, destLanguage, score);
								}
							}
						}
					}
					else { // no switching allowed
						int destLanguage = this.langIndex; // there will always be a current language here
						SingleLanguageModel destLM = lm.get(destLanguage);
						for (int c : destLM.getActiveCharacters()) { // punctuation no problem since we're definitely not switching anyway
							if (c != spaceCharIndex) {
								double pDestLang = 1.0; // since there's only one language for this character, don't divide its mass across languages
								double score = Math.log(1.0 - LINE_MRGN_PROB) + prevScore + Math.log(getNgramProb(destLM, context, c)) + Math.log(pDestLang);
								int[] nextContext = (!clearContext ? a.append((destLM!=null ? shrinkContext(context, destLM) : context), c) : new int[] { c });
								addGlyphStates(result, c, nextContext, TransitionStateType.TMPL, destLanguage, score);
							}
						}
					}
				}
	
				{ // space character: switching is never allowed
					SingleLanguageModel thisLM = lm.get(this.langIndex);
					// TODO: If current lmCharIndex==spaceCharIndex, sum over all languages?
					double pTransition = 0.0;
					//				if (lmCharIndex == spaceCharIndex) {
					double pDestLang = 1.0; // since there's only one language for this character, don't divide its mass across languages
					pTransition += getNgramProb(thisLM, context, spaceCharIndex) * pDestLang;
					//				}
					//				else {
					//					// total probability of transitioning to a space, regardless of language
					//					for (int destLanguage = 0; destLanguage < numLanguages; ++destLanguage) {
					//						SingleLanguageModel destLM = lm.get(destLanguage);
					//						double pDestLang = lm.languageTransitionPrior(this.langIndex, destLanguage);
					//						int[] shrunkenContext = shrinkContext(context, thisLM);
					//						pTransition += getNgramProb(thisLM, context, spaceCharIndex) * pDestLang;
					//					}
					//				}
					double score = Math.log(1.0 - LINE_MRGN_PROB) + prevScore + Math.log(pTransition);
					int[] nextContext = (!clearContext ? a.append((thisLM!=null ? shrinkContext(context, thisLM) : context), spaceCharIndex) : new int[] { spaceCharIndex });
					addNoSubGlyphStates(result, spaceCharIndex, nextContext, TransitionStateType.TMPL, this.langIndex, score);
				}
			}
		}

		public Collection<Tuple2<TransitionState, Double>> nextLineStartStates() {
			SingleLanguageModel thisLM = lm.get(this.langIndex);
			List<Tuple2<TransitionState, Double>> result = new ArrayList<Tuple2<TransitionState, Double>>();

			if (type == TransitionStateType.TMPL) {
				// transition from letter to space (left margin)
				double scoreWithSpace = Math.log(getNgramProb(thisLM, context, spaceCharIndex));
				int[] contextWithSpace = a.append((thisLM!=null ? shrinkContext(context, thisLM) : context), spaceCharIndex);

				{
					double score = Math.log(LINE_MRGN_PROB) + scoreWithSpace;
					addNoSubGlyphStates(result, spaceCharIndex, contextWithSpace, TransitionStateType.LMRGN, this.langIndex, score);
				}

				addTransitionsToTmpl(result, contextWithSpace, scoreWithSpace, false);
			}
			else if (type == TransitionStateType.RMRGN) {
				{
					double score = Math.log(LINE_MRGN_PROB);
					addNoSubGlyphStates(result, this.context, TransitionStateType.LMRGN, this.langIndex, score);
				}

				addTransitionsToTmpl(result, context);
			}
			else if (type == TransitionStateType.RMRGN_HPHN || type == TransitionStateType.RMRGN_HPHN_INIT) {
				{
					double score = Math.log(LINE_MRGN_PROB);
					addNoSubGlyphStates(result, this.context, TransitionStateType.LMRGN_HPHN, this.langIndex, score);
				}

				if (this.langIndex >= 0) { // can't have a hyphen if there is no language, since that means there have been no characters so far
					if (glyphChar.glyphType == GlyphType.DOUBLED) {
						// Duplicate the state: same context, language, lmChar, ...; but Doubled=>Normal
						TransitionStateType nextType = TransitionStateType.TMPL;
						int nextLanguage = langIndex;
						int nextLmChar = lmCharIndex;
						double score = Math.log(1.0); //+ Math.log(1.0 - LINE_MRGN_PROB) + Math.log(getNgramProb(thisLM, context, nextLmChar)) + Math.log(1.0); // TODO: Is it necessary to have some sort of LM probability factored in?
						if (nextLmChar == sCharIndex) { // a doubled 's' may have long-s chars
							GlyphChar nextGlyphCharS = new GlyphChar(sCharIndex, GlyphType.NORMAL_CHAR);
							double glyphLogProbS = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, nextGlyphCharS);
							addState(result, context, nextType, nextLanguage, nextGlyphCharS, score + glyphLogProbS);

							GlyphChar nextGlyphCharLongs = new GlyphChar(longsCharIndex, GlyphType.NORMAL_CHAR);
							double glyphLogProbLongs = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, nextGlyphCharLongs);
							addState(result, context, nextType, nextLanguage, nextGlyphCharLongs, score + glyphLogProbLongs);
						}
						else {
							GlyphChar nextGlyphChar = new GlyphChar(lmCharIndex, GlyphType.NORMAL_CHAR);
							double glyphLogProb = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, nextGlyphChar);
							addState(result, context, nextType, nextLanguage, nextGlyphChar, score + glyphLogProb);
						}
					}
					else {
						for (int c : thisLM.getActiveCharacters()) {
							if (c != spaceCharIndex && !punctSet.contains(c)) { // can't start a line after hyphen with space or punct 
								double score = Math.log(1.0 - LINE_MRGN_PROB) + Math.log(getNgramProb(thisLM, context, c)) /*+ Math.log(1.0)*/;
								int[] nextContext = a.append((thisLM!=null ? shrinkContext(context, thisLM) : context), c);
								addGlyphStates(result, c, nextContext, TransitionStateType.TMPL, this.langIndex, score);
							}
						}
					}
				}
			}
			else if (type == TransitionStateType.LMRGN || type == TransitionStateType.LMRGN_HPHN) {
				// TODO: TAYLOR: Why do we clear the context in this case?

				{
					double score = Math.log(LINE_MRGN_PROB);
					addNoSubGlyphStates(result, new int[0], TransitionStateType.LMRGN, this.langIndex, score);
				}

				addTransitionsToTmpl(result, context, 0.0, true);
			}
			return result;
		}

		public double endLogProb() {
			if (glyphChar.glyphType == GlyphType.DOUBLED || glyphChar.glyphType == GlyphType.ELISION_TILDE) // can't end on an incomplete "double glyph"
				return Double.NEGATIVE_INFINITY;
			else
				return 0.0;
		}

		/**
		 * Calculate forward transitions
		 */
		public Collection<Tuple2<TransitionState, Double>> forwardTransitions() {
			SingleLanguageModel thisLM = lm.get(this.langIndex);
			List<Tuple2<TransitionState, Double>> result = new ArrayList<Tuple2<TransitionState, Double>>();

			if (type == TransitionStateType.LMRGN) {
				{
					double score = Math.log(LINE_MRGN_PROB);
					addNoSubGlyphStates(result, this.context, TransitionStateType.LMRGN, this.langIndex, score);
				}

				addTransitionsToTmpl(result, context);
			}
			else if (type == TransitionStateType.LMRGN_HPHN) {
				{
					double score = Math.log(LINE_MRGN_PROB);
					addNoSubGlyphStates(result, this.context, TransitionStateType.LMRGN_HPHN, this.langIndex, score);
				}

				if (this.langIndex >= 0) { // can't have a hyphen if there is no language, since that means there have been no characters so far
					if (glyphChar.glyphType == GlyphType.DOUBLED) {
						// Duplicate the state: same context, language, lmChar, ...; but Doubled=>Normal
						TransitionStateType nextType = TransitionStateType.TMPL;
						int nextLanguage = langIndex;
						int nextLmChar = lmCharIndex;
						//double pDestLang = 1.0; // since there's only one language for this character, don't divide its mass across languages
						double score = Math.log(1.0); //+ Math.log(1.0 - LINE_MRGN_PROB) + Math.log(getNgramProb(thisLM, context, nextLmChar)) + Math.log(pDestLang); // TODO: Is it necessary to have some sort of LM probability factored in?
						if (nextLmChar == sCharIndex) { // a doubled 's' may have long-s chars
							GlyphChar nextGlyphCharS = new GlyphChar(sCharIndex, GlyphType.NORMAL_CHAR);
							double glyphLogProbS = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, nextGlyphCharS);
							addState(result, context, nextType, nextLanguage, nextGlyphCharS, score + glyphLogProbS);

							GlyphChar nextGlyphCharLongs = new GlyphChar(longsCharIndex, GlyphType.NORMAL_CHAR);
							double glyphLogProbLongs = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, nextGlyphCharLongs);
							addState(result, context, nextType, nextLanguage, nextGlyphCharLongs, score + glyphLogProbLongs);
						}
						else {
							GlyphChar nextGlyphChar = new GlyphChar(lmCharIndex, GlyphType.NORMAL_CHAR);
							double glyphLogProb = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, nextGlyphChar);
							addState(result, context, nextType, nextLanguage, nextGlyphChar, score + glyphLogProb);
						}
					}
					else {
						for (int c : thisLM.getActiveCharacters()) {
							if (c != spaceCharIndex && !punctSet.contains(c)) { // can't start a line after hyphen with space or punct 
								double pDestLang = 1.0; // since there's only one language for this character, don't divide its mass across languages
								double score = Math.log(1.0 - LINE_MRGN_PROB) + Math.log(getNgramProb(thisLM, context, c)) + Math.log(pDestLang);
								int[] nextContext = a.append((thisLM!=null ? shrinkContext(context, thisLM) : context), c);
								addGlyphStates(result, c, nextContext, TransitionStateType.TMPL, this.langIndex, score);
							}
						}
					}
				}
			}
			else if (type == TransitionStateType.RMRGN) {
				double score = Math.log(LINE_MRGN_PROB);
				addNoSubGlyphStates(result, this.context, TransitionStateType.RMRGN, this.langIndex, score);
			}
			else if (type == TransitionStateType.RMRGN_HPHN) {
				double score = Math.log(LINE_MRGN_PROB);
				addNoSubGlyphStates(result, this.context, TransitionStateType.RMRGN_HPHN, this.langIndex, score);
			}
			else if (type == TransitionStateType.RMRGN_HPHN_INIT) {
				double score = Math.log(LINE_MRGN_PROB);
				addNoSubGlyphStates(result, this.context, TransitionStateType.RMRGN_HPHN, this.langIndex, score);
			}
			else if (type == TransitionStateType.TMPL) {
				{
					double score = Math.log(LINE_MRGN_PROB) + Math.log(1.0 - LINE_END_HYPHEN_PROB) + Math.log(getNgramProb(thisLM, context, spaceCharIndex));
					int[] nextContext = a.append((thisLM!=null ? shrinkContext(context, thisLM) : context), spaceCharIndex);
					addNoSubGlyphStates(result, spaceCharIndex, nextContext, TransitionStateType.RMRGN, this.langIndex, score);
				}

				{
					double score = Math.log(LINE_MRGN_PROB) + Math.log(LINE_END_HYPHEN_PROB);
					addNoSubGlyphStates(result, this.context, TransitionStateType.RMRGN_HPHN_INIT, this.langIndex, score);
				}

				addTransitionsToTmpl(result, context);
			}
			return result;
		}

		public int getLmCharIndex() {
			return lmCharIndex;
		}
		
		public GlyphChar getGlyphChar() {
			return glyphChar;
		}

		public int getOffset() {
			throw new Error("Method not implemented");
		}

		public int getExposure() {
			throw new Error("Method not implemented");
		}

		public TransitionStateType getType() {
			return type;
		}

		public int getLanguageIndex() {
			return this.langIndex;
		}
		
		public String toString() {
			StringBuilder contextSB = new StringBuilder("[");
			for (int c : context)
				contextSB.append(charIndexer.getObject(c));
				//.append(", ");
			//if (context.length > 0) contextSB.delete(contextSB.length()-2, contextSB.length());
			contextSB.append("]");
			return "CodeSwitchTransitionState("+(langIndex>=0 ? langIndexer.getObject(langIndex) : "No Language")+", "+charIndexer.getObject(lmCharIndex)+", "+type+", "+contextSB+", "+glyphChar.toString(charIndexer)+")";
		}
	}

	private void addState(List<Tuple2<TransitionState, Double>> result, int[] stateContext, TransitionStateType stateType, int stateLanguage, GlyphChar glyphChar, double stateTransitionScore) {
		if (stateTransitionScore != Double.NEGATIVE_INFINITY) {
			result.add(Tuple2((TransitionState) new CodeSwitchTransitionState(stateContext, stateType, stateLanguage, glyphChar), stateTransitionScore));
		}
	}

	public static final double LINE_MRGN_PROB = 0.5;
	public static final double LINE_END_HYPHEN_PROB = 1e-8;

	private Indexer<String> charIndexer;
	private Indexer<String> langIndexer;
	private int spaceCharIndex;
	private int hyphenCharIndex;
	private int sCharIndex;
	private int longsCharIndex;
	private Set<Integer> punctSet;
	
	private Set<Integer> canBeReplaced;
	private Set<Integer> validSubstitutionChars;
	private Set<Integer> validDoublableSet;
	private Set<Integer> canBeElided;
	private Map<Integer, Integer> addTilde;
	private Map<Integer,Integer> diacriticDisregardMap;

	private int numLanguages;
	private CodeSwitchLanguageModel lm;
	private GlyphSubstitutionModel gsm;
	private boolean allowLanguageSwitchOnPunct;
	private boolean allowGlyphSubstitution;
	private double noCharSubPrior;
	private boolean elideAnything;

	private Set<TransitionStateType> alwaysSpaceTransitionTypes;
	
	/**
	 * character index is the last letter of the context.
	 * 
	 * if this is the beginning of a line (context is empty or the type
	 * is a margin), then charindex is a space. if it's a right margin,
	 * then last letter is a hyphen; if there is a context then you
	 * know, context.
	 */
	private int makeLmCharIndex(int[] context, TransitionStateType type) {
		if (context.length == 0 || this.alwaysSpaceTransitionTypes.contains(type)) {
			return spaceCharIndex;
		}
		else if (type == TransitionStateType.RMRGN_HPHN_INIT) {
			return hyphenCharIndex;
		}
		else {
			return context[context.length - 1];
		}
	}

	public CodeSwitchTransitionModel(CodeSwitchLanguageModel lm, boolean allowLanguageSwitchOnPunct, GlyphSubstitutionModel gsm, boolean allowGlyphSubstitution, double noCharSubPrior, boolean elideAnything) {
		this.lm = lm;
		this.gsm = gsm;
		this.allowLanguageSwitchOnPunct = allowLanguageSwitchOnPunct;
		this.allowGlyphSubstitution = allowGlyphSubstitution;
		this.noCharSubPrior = noCharSubPrior;
		this.elideAnything = elideAnything;

		this.charIndexer = lm.getCharacterIndexer();
		this.langIndexer = lm.getLanguageIndexer();
		this.spaceCharIndex = charIndexer.getIndex(Charset.SPACE);
		this.hyphenCharIndex = charIndexer.getIndex(Charset.HYPHEN);
		this.sCharIndex = charIndexer.contains("s") ? charIndexer.getIndex("s") : -1;
		this.longsCharIndex = charIndexer.getIndex(Charset.LONG_S);
		this.punctSet = makePunctSet(charIndexer);
		this.canBeReplaced = makeCanBeReplacedSet(charIndexer);
		this.validSubstitutionChars = makeValidSubstitutionCharsSet(charIndexer);
		this.validDoublableSet = makeValidDoublableSet(charIndexer);
		this.canBeElided = makeCanBeElidedSet(charIndexer);
		this.addTilde = makeAddTildeMap(charIndexer);
		this.diacriticDisregardMap = makeDiacriticDisregardMap(charIndexer);

		this.numLanguages = lm.getLanguageIndexer().size();
		this.alwaysSpaceTransitionTypes = makeSet(TransitionStateType.LMRGN, TransitionStateType.LMRGN_HPHN, TransitionStateType.RMRGN, TransitionStateType.RMRGN_HPHN);
	}

	private void addNoSubGlyphStartState(List<Tuple2<TransitionState, Double>> result, int[] nextContext, TransitionStateType nextType, int nextLanguage, double transitionScore) {
		if (!allowGlyphSubstitution)
			addState(result, nextContext, nextType, nextLanguage, new GlyphChar(spaceCharIndex, GlyphType.NORMAL_CHAR), transitionScore);
		else {
			// 1. Next state's glyph is just the rendering of the LM character
			GlyphChar nextGlyphChar = new GlyphChar(spaceCharIndex, GlyphType.NORMAL_CHAR);
			double glyphLogProb = calculateGlyphLogProb(nextType, nextLanguage, spaceCharIndex, nextGlyphChar);
			addState(result, nextContext, nextType, nextLanguage, nextGlyphChar, transitionScore + glyphLogProb);
		}
	}

	/**
	 * Add transition states, allowing for the possibility of substitutions or elisions.
	 * 
	 *   1. Next state's glyph is just the rendering of the LM character
	 *   2. Next state's glyph is a substitution of the LM character
	 *   3. Next state's glyph is an elision-decorated version of the LM character
	 *   4. Next state's glyph is elided
	 *   5. Next state's glyph is the LM char, stripped of its accents
	 *   6. Next state's glyph is an elision after a space
	 *   7. Next state's glyph is a doubled version of the LM character
	 * 
	 */
	private void addGlyphStartStates(List<Tuple2<TransitionState, Double>> result, int nextLmChar, int[] nextContext, TransitionStateType nextType, int nextLanguage, double transitionScore) {
		if (!allowGlyphSubstitution)
			addState(result, nextContext, nextType, nextLanguage, new GlyphChar(nextLmChar, GlyphType.NORMAL_CHAR), transitionScore);
		else {
			Set<GlyphChar> potentialNextGlyphChars = new HashSet<GlyphChar>(); 
	
			// 1. Next state's glyph is just the rendering of the LM character
			potentialNextGlyphChars.add(new GlyphChar(nextLmChar, GlyphType.NORMAL_CHAR));
			
			// 2. Next state's glyph is a substitution of the LM character
			if (canBeReplaced.contains(nextLmChar)) {
				for (String c : lm.get(nextLanguage).getCharacterIndexer().getObjects()) {
                    int nextGlyphCharIndex = charIndexer.getIndex(c);
					if (validSubstitutionChars.contains(nextGlyphCharIndex)) {
						potentialNextGlyphChars.add(new GlyphChar(nextGlyphCharIndex, GlyphType.NORMAL_CHAR));
					}
				}
			}
			if (nextLmChar == sCharIndex)
				potentialNextGlyphChars.add(new GlyphChar(longsCharIndex, GlyphType.NORMAL_CHAR));
			
			// 3. Next state's glyph is an elision-decorated version of the LM character
			Integer tildeDecorated = addTilde.get(nextLmChar);
			if (tildeDecorated != null) {
				potentialNextGlyphChars.add(new GlyphChar(tildeDecorated, GlyphType.ELISION_TILDE));
			}
			
			// 5. Next state's glyph is the LM char, stripped of its accents
			Integer baseChar = diacriticDisregardMap.get(nextLmChar);
			if (baseChar != null) {
				potentialNextGlyphChars.add(new GlyphChar(baseChar, GlyphType.NORMAL_CHAR));
			}

			// 6. Next state's glyph is an elision after a space --- and the start state is always a "space"
			if (!elideAnything) {
			if (nextType == TransitionStateType.TMPL) {
				if (canBeElided.contains(nextLmChar)) {
					potentialNextGlyphChars.add(new GlyphChar(spaceCharIndex, GlyphType.FIRST_ELIDED));
				}
			}
			}

			// 7. Next state's glyph is a doubled version of the LM character
			if (validDoublableSet.contains(nextLmChar)) {
				potentialNextGlyphChars.add(new GlyphChar(nextLmChar, GlyphType.DOUBLED));
				if (nextLmChar == sCharIndex)
					potentialNextGlyphChars.add(new GlyphChar(longsCharIndex, GlyphType.DOUBLED));
			}
			
			// 8. Elide the character
			if (elideAnything) {
				if (nextType == TransitionStateType.TMPL) {
					if (canBeElided.contains(nextLmChar)) {
						potentialNextGlyphChars.add(new GlyphChar(spaceCharIndex, GlyphType.ELIDED));
					}
				}
			}

			// Create states for all the potential next glyphs
			
			/*if (nextLmChar == 14) {
                System.out.println("Start\n");
            }*/
			for (GlyphChar nextGlyphChar : potentialNextGlyphChars) {
				double glyphLogProb = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, nextGlyphChar);
                /*if (nextLmChar == 14) {
                    if (nextGlyphChar.templateCharIndex == spaceCharIndex) {
                        System.out.println("SPACE");
                        System.out.println(spaceCharIndex);
                    }
                    System.out.println(nextGlyphChar.templateCharIndex);
                    System.out.println(glyphLogProb);
                }*/
				addState(result, nextContext, nextType, nextLanguage, nextGlyphChar, transitionScore + glyphLogProb);
			}
			/*if (nextLmChar == 14) {
                System.out.println("Done\n");
            }*/
		}
	}

	/**
	 * Make a collection of states that can be the start of a line.
	 * 
	 * First possibility: L-Margin, with no context. Has probability LINE_MRGN_PROB * prior prob of the language. (1 of this) 
	 * Other possibilities: TMPL, with any individual single character c as context (~75 of these) 
	 *   - probability is: 1-LINE_MRGN_PROB * probability of c with no context * prior prob of the language.
	 */
	public Collection<Tuple2<TransitionState, Double>> startStates() {
		List<Tuple2<TransitionState, Double>> result = new ArrayList<Tuple2<TransitionState, Double>>();
		/*
		 * Don't force a language choice.
		 */
		{
			double score = Math.log(LINE_MRGN_PROB) /*+ Math.log(1.0)*/;
			addNoSubGlyphStartState(result, new int[0], TransitionStateType.LMRGN, -1, score);
		}
		/*
		 * Choose among all the languages when there's an actual word (not a space).
		 */
		for (int destLanguage = 0; destLanguage < numLanguages; ++destLanguage) {
			SingleLanguageModel destLM = lm.get(destLanguage);
			double destLanguagePrior = lm.languagePrior(destLanguage);
			for (int c : destLM.getActiveCharacters()) {
				if (c != spaceCharIndex) {
					double score = Math.log(1.0 - LINE_MRGN_PROB) + Math.log(getNgramProb(destLM, new int[0], c)) + Math.log(destLanguagePrior);
					addGlyphStartStates(result, c, new int[] { c }, TransitionStateType.TMPL, destLanguage, score);
				}
			}
		}
		/*
		 * Since there's no "first" language, and we don't want to force a language 
		 * choice without an actual word, calculate the probability of starting the
		 * line with a space as the sum of the no-context space probabilities across
		 * all the languages, weighted by the language priors. 
		 */
		{
			double totalSpaceProb = 0.0;
			for (int language = 0; language < numLanguages; ++language)
				totalSpaceProb += getNgramProb(lm.get(language), new int[0], spaceCharIndex) * lm.languagePrior(language);
			double score = Math.log(1.0 - LINE_MRGN_PROB) + Math.log(totalSpaceProb) /*+ Math.log(1.0)*/;
			addNoSubGlyphStartState(result, new int[] { spaceCharIndex }, TransitionStateType.TMPL, -1, score);
		}
		return result;
	}

	private double getNgramProb(SingleLanguageModel slm, int[] context, int c) {
		if (slm != null) {
			return slm.getCharNgramProb(shrinkContext(context, slm), c);
		}
		else {
			// No current language, so sum transition to `c` across all languages
			double totalSpaceProb = 0.0;
			for (int language = 0; language < numLanguages; ++language) {
				SingleLanguageModel languageLM = this.lm.get(language);
				totalSpaceProb += languageLM.getCharNgramProb(shrinkContext(context, languageLM), c) * this.lm.languagePrior(language);
			}
			return totalSpaceProb;
		}
	}

	//	private int[] appendToContext(int[] originalContext, int c, SingleLanguageModels lm) {
	//		return shrinkContext(a.append(originalContext, c), slm);
	//	}

	private double calculateGlyphLogProb(TransitionStateType nextType, int nextLanguage, int nextLmChar, GlyphChar nextGlyphChar) {
		if (nextLanguage < 0) {
			if (this.alwaysSpaceTransitionTypes.contains(nextType) && nextGlyphChar.templateCharIndex == spaceCharIndex)
				return 0.0; // log(1)
			else
				return Double.NEGATIVE_INFINITY; // log(0)
		}
		else {
			double p = (1.0 - noCharSubPrior) * gsm.glyphProb(nextLanguage, nextLmChar, nextGlyphChar);
			double pWithBias = ((nextGlyphChar.glyphType == GlyphType.NORMAL_CHAR && nextGlyphChar.templateCharIndex == nextLmChar) ? noCharSubPrior + p : p);
             
            /*if (nextLmChar == 14 && (nextGlyphChar.templateCharIndex == 0 || nextGlyphChar.templateCharIndex == 14)) { // m
                System.out.println(nextLmChar);
                System.out.println(nextGlyphChar.templateCharIndex);
                System.out.println(gsm.glyphProb(nextLanguage, nextLmChar, nextGlyphChar));
                System.out.println(p);
                System.out.println(pWithBias);
                System.out.println("\n");
            } */
			return Math.log(pWithBias);
		}
	}

	private int[] shrinkContext(int[] originalContext, SingleLanguageModel slm) {
		int[] newContext = originalContext;
		int maxOrder = slm.getMaxOrder();
		while (newContext.length > maxOrder - 1)
			newContext = ArrayHelper.takeRight(newContext, maxOrder - 1);
		if (slm != null) {
			newContext = slm.shrinkContext(newContext);
		}
		return newContext;
	}
}