001 /*
002 * FactorSequence.java
003 *
004 * Copyright 2003 Sergio Anibal de Carvalho Junior
005 *
006 * This file is part of NeoBio.
007 *
008 * NeoBio is free software; you can redistribute it and/or modify it under the terms of
009 * the GNU General Public License as published by the Free Software Foundation; either
010 * version 2 of the License, or (at your option) any later version.
011 *
012 * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
013 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
014 * PURPOSE. See the GNU General Public License for more details.
015 *
016 * You should have received a copy of the GNU General Public License along with NeoBio;
017 * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
018 * Boston, MA 02111-1307, USA.
019 *
020 * Proper attribution of the author as the source of the software would be appreciated.
021 *
022 * Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net
023 * Department of Computer Science http://www.dcs.kcl.ac.uk
024 * King's College London, UK http://www.kcl.ac.uk
025 *
026 * Please visit http://neobio.sourceforge.net
027 *
028 * This project was supervised by Professor Maxime Crochemore.
029 *
030 */
031
032 package neobio.alignment;
033
034 import java.io.Reader;
035 import java.io.BufferedReader;
036 import java.io.IOException;
037
038 /**
039 * This class builds a list of factors of a character sequence as induced by its
040 * Lempel-Ziv (LZ78) factorisation. Each factor is enconded as the longest factor
041 * previously seen plus one character.
042 *
043 * <P>The input can come from any source, provided it is encapsulated in a proper
044 * <CODE>Reader</CODE> instance. The stream is expected to be ready (i.e. the next
045 * <CODE>read</CODE> operation must return the first character of the sequence) and it is
046 * not closed when its end is reached, so the client is allowed to reset it and maybe use
047 * it for another purpose.</P>
048 *
049 * <P>Sequences can contain letters only although lines started with the
050 * <CODE>COMMENT_CHAR</CODE> character ('>') are regarded as comments and are completely
051 * skipped. White spaces (including tabs, line feeds and carriage returns) are also
052 * ignored throughout.</P>
053 *
054 * <P>This class uses a {@linkplain Trie} to keep track of a list of factors. Each node of
055 * the trie contains a {@linkplain Factor} of the text. As the sequence is read from the
056 * input, the trie is traversed as far as possible. When a leaf node is reached (which
057 * means that the longest prefix of the input has been found), two tasks are
058 * accomplished:</P>
059 *
060 * <UL>
061 * <LI>a new <CODE>Factor</CODE> is created with the character at the current position of
062 * the input and the leaf node's factor;
063 * <LI>a new node is added to the trie with the character at the current position of the
064 * input;
065 * </UL>
066 *
067 * <P>Each factor also receives a serial number according to the order they are found and
068 * a pointer to the next factor (in that order) for fast access. This pointer, together
069 * with the factor's ancestor pointer forms a doubly-linked list of factors. The original
070 * text can then be reconstructed simply by following the linked list and writing out its
071 * factors.</P>
072 *
073 * <P>As an example, the sequence <CODE>ACTAAACCGCATTAATAATAAAA</CODE> is parsed into the
074 * following 12 factors:</P>
075 *
076 * <CODE><BLOCKQUOTE><PRE>
077 * 0 ( , ) = empty
078 * 1 (0,A) = A
079 * 2 (0,C) = C
080 * 3 (0,T) = T
081 * 4 (1,A) = AA
082 * 5 (1,C) = AC
083 * 6 (2,G) = CG
084 * 7 (2,A) = CA
085 * 8 (3,T) = TT
086 * 9 (4,T) = AAT
087 * 10 (9,A) = AATA
088 * 11 (4,A) = AAA
089 *
090 * serial # (prefix, new char) = factor text
091 * </PRE></BLOCKQUOTE></CODE>
092 *
093 * <P>This class is used by {@linkplain CrochemoreLandauZivUkelson} algorithm to speed up
094 * the classic dynamic programming approach to sequence alignment.</P>
095 *
096 * @author Sergio A. de Carvalho Jr.
097 * @see Factor
098 * @see Trie
099 * @see CrochemoreLandauZivUkelson
100 */
101 public class FactorSequence
102 {
103 /**
104 * The character used to start a comment line in a sequence file. When this character
105 * is found, the rest of the line is ignored.
106 */
107 protected static final char COMMENT_CHAR = '>';
108
109 /**
110 * A pointer to the root factor, the one that starts the list of factors.
111 */
112 protected Factor root_factor;
113
114 /**
115 * The numbers of character represented by this sequence.
116 */
117 protected int num_chars;
118
119 /**
120 * The numbers of factors generated by the LZ78 parsing of the sequence.
121 */
122 protected int num_factors;
123
124 /**
125 * Creates a new instance of a <CODE>FactorSequence</CODE>, loading the sequence data
126 * from the <CODE>Reader</CODE> input stream. A doubly-linked list of factors is built
127 * according to its LZ78 factorisation.
128 *
129 * @param reader source of characters for this sequence
130 * @throws IOException if an I/O exception occurs when reading the input
131 * @throws InvalidSequenceException if the input does not contain a valid sequence
132 */
133 public FactorSequence (Reader reader)
134 throws IOException, InvalidSequenceException
135 {
136 BufferedReader input = new BufferedReader(reader);
137 Trie root_node, current_node, new_node = null;
138 Factor current_factor, last_factor, new_factor;
139 int ch;
140 char c;
141
142 // create root factor and the root node of the trie
143 root_factor = new Factor ();
144 root_node = new Trie (root_factor);
145 num_factors = 1;
146 num_chars = 0;
147
148 current_node = root_node;
149 last_factor = root_factor;
150
151 // read characters from the input
152 while ((ch = input.read()) != -1)
153 {
154 c = (char) ch;
155
156 if (c == COMMENT_CHAR)
157 // it's a comment line: skip it!
158 input.readLine();
159
160 // accept letters only
161 else if (Character.isLetter(c))
162 {
163 num_chars++;
164
165 // walk down the trie as far as possible
166 new_node = current_node.spellDown(c);
167
168 if (new_node != null)
169 {
170 current_node = new_node;
171 }
172 else
173 {
174 // the longest factor of the input has been found,
175 // now create a new factor from the current node's factor
176 current_factor = (Factor) current_node.getData();
177 new_factor = new Factor (current_factor, num_factors, c);
178
179 // add the new character to the trie as well
180 current_node.add (new_factor, c);
181
182 // set up a pointer from the last factor to the new one
183 last_factor.setNext (new_factor);
184 last_factor = new_factor;
185
186 // restart at the root of the trie
187 current_node = root_node;
188
189 num_factors++;
190 }
191 }
192
193 // anything else, except whitespaces, will throw an exception
194 else if (!Character.isWhitespace(c))
195 throw new InvalidSequenceException
196 ("Sequences can contain letters only.");
197 }
198
199 // if new_node is not null, the last factor is actually
200 // not a new factor but a factor already created
201 if (new_node != null)
202 {
203 // no new node is created, just point the last_factor to an
204 // existing one that represents the last characters of the text
205 last_factor.setNext((Factor) new_node.getData());
206
207 num_factors++;
208 }
209
210 // check if read anything useful!
211 if (num_factors <= 1)
212 throw new InvalidSequenceException ("Empty sequence.");
213 }
214
215 /**
216 * Returns the root factor, the one that starts the list of factors.
217 *
218 * @return root factor
219 */
220 public Factor getRootFactor ()
221 {
222 return root_factor;
223 }
224
225 /**
226 * Returns the number of factors produced by the LZ78 parsing of the text.
227 *
228 * @return number of factors
229 */
230 public int numFactors()
231 {
232 return num_factors;
233 }
234
235 /**
236 * Returns the number of characters of the original sequence.
237 *
238 * @return number of characters of the original sequence
239 */
240 public int numChars ()
241 {
242 return num_chars;
243 }
244
245 /**
246 * Reconstructs the sequence from the list of factors induced by the LZ78 parsing of
247 * the text.
248 *
249 * @return the original sequence
250 */
251 public String toString ()
252 {
253 StringBuffer buf = new StringBuffer();
254 Factor node;
255
256 node = root_factor.getNext();
257
258 for (int i = 1; i < numFactors(); i++)
259 {
260 buf.append(node);
261
262 node = node.getNext();
263 }
264
265 return buf.toString();
266 }
267
268 /**
269 * Returns a string representation of the actual list of factors produced by the LZ78
270 * parsing of the text. Each factor is printed out in a separate line, in the order
271 * they appear in the text, with its serial number, its ancestor's serial number, its
272 * new character, length and a string representation of the factor itself.
273 *
274 * @return a string representation of the list of factors
275 */
276 public String printFactors ()
277 {
278 StringBuffer buf = new StringBuffer();
279 Factor factor;
280
281 factor = root_factor.getNext();
282
283 for (int i = 1; i < numFactors(); i++)
284 {
285 buf.append (factor.getSerialNumber() + "\t<");
286 buf.append (factor.getAncestor().getSerialNumber() + " ,\t");
287 buf.append (factor.getNewChar() + ">\t");
288 buf.append (factor.length() + "\t" + factor + "\n");
289
290 factor = factor.getNext();
291 }
292
293 buf.append(numFactors() + " factors\n");
294
295 return buf.toString();
296 }
297 }