001 /*
002 * Smawk.java
003 *
004 * Copyright 2003 Sergio Anibal de Carvalho Junior
005 *
006 * This file is part of NeoBio.
007 *
008 * NeoBio is free software; you can redistribute it and/or modify it under the terms of
009 * the GNU General Public License as published by the Free Software Foundation; either
010 * version 2 of the License, or (at your option) any later version.
011 *
012 * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
013 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
014 * PURPOSE. See the GNU General Public License for more details.
015 *
016 * You should have received a copy of the GNU General Public License along with NeoBio;
017 * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
018 * Boston, MA 02111-1307, USA.
019 *
020 * Proper attribution of the author as the source of the software would be appreciated.
021 *
022 * Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net
023 * Department of Computer Science http://www.dcs.kcl.ac.uk
024 * King's College London, UK http://www.kcl.ac.uk
025 *
026 * Please visit http://neobio.sourceforge.net
027 *
028 * This project was supervised by Professor Maxime Crochemore.
029 *
030 */
031
032 package neobio.alignment;
033
034
035 /**
036 * This class implement the SMAWK algorithm to compute column maxima on a totally monotone
037 * matrix as described.
038 *
039 * <P>This implementation derives from the paper of A.Aggarwal, M.Klawe, S.Moran, P.Shor,
040 * and R.Wilber, <I>Geometric Applications of a Matrix Searching Algorithm</I>,
041 * Algorithmica, 2, 195-208 (1987).</P>
042 *
043 * <P>The matrix must be an object that implements the {@linkplain Matrix} interface. It
044 * is also expected to be totally monotone, and the number of rows should be greater than
045 * or equals to the number of columns. If these conditions are not met, the the result is
046 * unpredictable and can lead to an ArrayIndexOutOfBoundsException.</P>
047 *
048 * <P>{@link #computeColumnMaxima computeColumnMaxima} is the main public method of this
049 * class. It computes the column maxima of a given matrix, i.e. the rows that contain the
050 * maximum value of each column in O(n) (linear) time, where n is the number of rows. This
051 * method does not return the maximum values itself, but just the indexes of their
052 * rows.</P>
053 *
054 * <P>Note that it is necessary to create an instance of this class to execute the
055 * <CODE>computeColumnMaxima</CODE> because it stores temporary data is that instance. To
056 * prevent problems with concurrent access, the <CODE>computeColumnMaxima</CODE> method is
057 * declared <CODE>synchronized</CODE>.
058 *
059 * <CODE><BLOCKQUOTE><PRE>
060 * // create an instance of Smawk
061 * Smawk smawk = new Smawk();
062 *
063 * // create an array to store the result
064 * int col_maxima = new int [some_matrix.numColumns()];
065 *
066 * // now compute column maxima
067 * smawk.computeColumnMaxima (some_matrix, col_maxima)
068 * </PRE></BLOCKQUOTE></CODE>
069 *
070 * <P>Note that the array of column maxima indexes (the computation result) must be
071 * created beforehand and its size must be equal to the number of columns of the
072 * matrix.</P>
073 *
074 * <P>This implementation creates arrays of row and column indexes from the original array
075 * and simulates all operations (reducing, deletion of odd columns, etc.) by manipulating
076 * these arrays. The benefit is two-fold. First the matrix is not required to implement
077 * any of these this operations but only a simple method to retrieve a value at a given
078 * position. Moreover, it tends to be faster since it uses a manipulation of these small
079 * vectors and no row or column is actually deleted. The downside is, of course, the use
080 * of extra memory (in practice, however, this is negligible).</P>
081 *
082 * <P>Note that this class does not contain a <CODE>computeRowMaxima</CODE> method,
083 * however, the <CODE>computeColumnMaxima</CODE> can easily be used to compute row maxima
084 * by using a transposed matrix interface, i.e. one that inverts the indexes of the
085 * <CODE>valueAt</CODE> method (returning [col,row] when [row,col] is requested) and swaps
086 * the number of rows by the number of columns, and vice-versa.</P>
087 *
088 * <P>Another simpler method, {@link #naiveComputeColumnMaxima naiveComputeColumnMaxima},
089 * does the same job without using the SMAWK algorithm. It takes advantage of the monotone
090 * property of the matrix only (SMAWK explores the stronger constraint of total
091 * monotonicity), and therefore has a worst case time complexity of O(n * m), where n is
092 * the number of rows and m is the number of columns. However, this method tends to be
093 * faster for small matrices because it avoids recursions and row and column
094 * manipulations. There is also a
095 * {@linkplain #naiveComputeRowMaxima naiveComputeRowMaxima} method to compute row maxima
096 * with the naive approach.</P>
097 *
098 * @author Sergio A. de Carvalho Jr.
099 * @see Matrix
100 */
101 public class Smawk
102 {
103 /**
104 * A pointer to the matrix that is being manipulated.
105 */
106 protected Matrix matrix;
107
108 /**
109 * The matrix's current number of rows. This reflects any deletion of rows already
110 * performed.
111 */
112 protected int numrows;
113
114 /**
115 * An array of row indexes reflecting the current state of the matrix. When rows are
116 * deleted, the corresponding indexes are simply moved to the end of the vector.
117 */
118 protected int row[];
119
120 /**
121 * This array is used to store for each row of the original matrix, its index in the
122 * current state of the matrix, i.e. its index in the <CODE>row</CODE> array.
123 */
124 protected int row_position[];
125
126 /**
127 * The matrix's current number of columns. This reflects any deletion of columns
128 * already performed.
129 */
130 protected int numcols;
131
132 /**
133 * An array of column indexes reflecting the current state of the matrix. When columns
134 * are deleted, the corresponding indexes are simply moved to the end of the vector.
135 */
136 protected int col[];
137
138 /**
139 * Computes the column maxima of a given matrix. It first sets up arrays of row and
140 * column indexes to simulate a copy of the matrix (where all operations will be
141 * performed). It then calls the recursive protected <CODE>computeColumnMaxima</CODE>
142 * method.
143 *
144 * <P>The matrix is required to be an object that implements the <CODE>Matrix</CODE>
145 * interface. It is also expected to be totally monotone, and the number of rows
146 * should be greater than or equals to the number of columns. If it is not, the the
147 * result is unpredictable and can lead to an ArrayIndexOutOfBoundsException.</P>
148 *
149 * <P>This method does not return the maximum values itself, but just the indexes of
150 * their rows. Note that the array of column maxima (the computation result) must be
151 * created beforehand and its size must be equal to the number of columns of the
152 * matrix.</P>
153 *
154 * <P>To prevent problems with concurrent access, this method is declared
155 * <CODE>synchronized</CODE>.</P>
156 *
157 * @param matrix the matrix that will have its column maxima computed
158 * @param col_maxima the array of column maxima (indexes of the rows containing
159 * maximum values of each column); this is the computation result
160 * @see #computeColumnMaxima(int[])
161 */
162 public synchronized void computeColumnMaxima (Matrix matrix, int[] col_maxima)
163 {
164 int i;
165
166 this.matrix = matrix;
167
168 // create an array of column indexes
169 numcols = matrix.numColumns();
170 col = new int [numcols];
171 for (i = 0; i < numcols; i++)
172 col[i] = i;
173
174 // create an array of row indexes
175 numrows = matrix.numRows();
176 row = new int [numrows];
177 for (i = 0; i < numrows; i++)
178 row[i] = i;
179
180 // instantiate an helper array for
181 // backward reference of rows
182 row_position = new int [numrows];
183
184 computeColumnMaxima (col_maxima);
185 }
186
187 /**
188 * This method implements the SMAWK algorithm to compute the column maxima of a given
189 * matrix. It uses the arrays of row and column indexes to performs all operations on
190 * this 'fake' copy of the original matrix.
191 *
192 * <P>The first step is to reduce the matrix to a quadratic size (if necessary). It
193 * then delete all odd columns and recursively computes column maxima for this matrix.
194 * Finally, using the information computed for the odd columns, it searches for
195 * column maxima of the even columns. The column maxima are progressively stored in
196 * the <CODE>col_maxima</CODE> array (each recursive call will compute a set of
197 * column maxima).</P>
198 *
199 * @param col_maxima the array of column maxima (the computation result)
200 */
201 protected void computeColumnMaxima (int[] col_maxima)
202 {
203 int original_numrows, original_numcols, c, r, max, end;
204
205 original_numrows = numrows;
206
207 if (numrows > numcols)
208 {
209 // reduce to a quadratic size by deleting
210 // rows that contain no maximum of any column
211 reduce ();
212 }
213
214 // base case: matrix has only one row (and one column)
215 if (numrows == 1)
216 {
217 // so the first column's maximum is the only row left
218 col_maxima[col[0]] = row[0];
219
220 if (original_numrows > numrows)
221 {
222 // restore rows of original matrix (deleted on reduction)
223 restoreRows (original_numrows);
224 }
225
226 return;
227 }
228
229 // save the number of columns before deleting the odd ones
230 original_numcols = numcols;
231
232 deleteOddColumns ();
233
234 // recursively computes max rows for the remaining even columns
235 computeColumnMaxima (col_maxima);
236
237 restoreOddColumns (original_numcols);
238
239 // set up pointers to the original index for all rows
240 for (r = 0; r < numrows; r++)
241 row_position[row[r]] = r;
242
243 // compute max rows for odd columns based on the result of even columns
244 for (c = 1; c < numcols; c = c + 2)
245 {
246 if (c < numcols - 1)
247 // if not last column, search ends
248 // at next columns' max row
249 end = row_position[col_maxima[col[c + 1]]];
250 else
251 // if last columnm, search ends
252 // at last row
253 end = numrows - 1;
254
255 // search starts at previous columns' max row
256 max = row_position[col_maxima[col[c - 1]]];
257
258 // check all values until the end
259 for (r = max + 1; r <= end; r++)
260 {
261 if (valueAt(r, c) > valueAt(max, c))
262 max = r;
263 }
264
265 col_maxima[col[c]] = row[max];
266 }
267
268 if (original_numrows > numrows)
269 {
270 // restore rows of original matrix (deleted on reduction)
271 restoreRows (original_numrows);
272 }
273 }
274
275 /**
276 * This is a helper method to simplify the call to the <CODE>valueAt</CODE> method
277 * of the matrix. It returns the value at row <CODE>r</CODE>, column <CODE>c</CODE>.
278 *
279 * @param r the row number of the value being retrieved
280 * @param c the column number of the value being retrieved
281 * @return the value at row <CODE>r</CODE>, column <CODE>c</CODE>
282 * @see Matrix#valueAt
283 */
284 protected final int valueAt (int r, int c)
285 {
286 return matrix.valueAt (row[r], col[c]);
287 }
288
289 /**
290 * This method simulates a deletion of odd rows by manipulating the <CODE>col</CODE>
291 * array of indexes. In fact, nothing is deleted, but the indexes are moved to the end
292 * of the array in a way that they can be easily restored by the
293 * <CODE>restoreOddColumns</CODE> method using a reverse approach.
294 *
295 * @see #restoreOddColumns
296 */
297 protected void deleteOddColumns ()
298 {
299 int tmp;
300
301 for (int c = 2; c < numcols; c = c + 2)
302 {
303 // swap column c with c/2
304 tmp = col[c / 2];
305 col[c / 2] = col[c];
306 col[c] = tmp;
307 }
308
309 numcols = ((numcols - 1) / 2 + 1);
310 }
311
312 /**
313 * Restores the <CODE>col</CODE> array of indexes to the state it was before the
314 * <CODE>deleteOddColumns</CODE> method was called. It only needs to know how many
315 * columns there was originally. The indexes that were moved to the end of the array
316 * are restored to their original position.
317 *
318 * @param original_numcols the number of columns before the odd ones were deleted
319 * @see #deleteOddColumns
320 */
321 protected void restoreOddColumns (int original_numcols)
322 {
323 int tmp;
324
325 for (int c = 2 * ((original_numcols - 1) / 2); c > 0; c = c - 2)
326 {
327 // swap back column c with c/2
328 tmp = col[c / 2];
329 col[c / 2] = col[c];
330 col[c] = tmp;
331 }
332
333 numcols = original_numcols;
334 }
335
336 /**
337 * This method is the key component of the SMAWK algorithm. It reduces an n x m matrix
338 * (n rows and m columns), where n >= m, to an n x n matrix by deleting m - n rows
339 * that are guaranteed to have no maximum value for any column. The result is an
340 * squared submatrix matrix that contains, for each column c, the row that has the
341 * maximum value of c in the original matrix. The rows are deleted with the
342 * <CODE>deleteRow</CODE>method.
343 *
344 * <P>It uses the total monotonicity property of the matrix to identify which rows can
345 * safely be deleted.
346 *
347 * @see #deleteRow
348 */
349 protected void reduce ()
350 {
351 int k = 0, reduced_numrows = numrows;
352
353 // until there is more rows than columns
354 while (reduced_numrows > numcols)
355 {
356 if (valueAt(k, k) < valueAt(k + 1, k))
357 {
358 // delete row k
359 deleteRow (reduced_numrows, k);
360 reduced_numrows --;
361 k --;
362 }
363 else
364 {
365 if (k < numcols - 1)
366 {
367 k++;
368 }
369 else
370 {
371 // delete row k+1
372 deleteRow (reduced_numrows, k+1);
373 reduced_numrows --;
374 }
375 }
376 }
377
378 numrows = reduced_numrows;
379 }
380
381 /**
382 * This method simulates a deletion of a row in the matrix during the
383 * <CODE>reduce</CODE> operation. It just moves the index to the end of the array in a
384 * way that it can be restored afterwards by the <CODE>restoreRows</CODE> method
385 * (nothing is actually deleted). Deleted indexes are kept in ascending order.
386 *
387 * @param reduced_rows the current number of rows in the reducing matrix
388 * @param k the index of the row to be deleted
389 * @see #restoreRows
390 */
391 protected void deleteRow (int reduced_rows, int k)
392 {
393 int r, saved_row = row[k];
394
395 for (r = k + 1; r < reduced_rows; r++)
396 row[r - 1] = row[r];
397
398 for (r = reduced_rows - 1; r < (numrows - 1) && row[r+1] < saved_row; r++)
399 row[r] = row[r+1];
400
401 row[r] = saved_row;
402 }
403
404 /**
405 * Restores the <CODE>row</CODE> array of indexes to the state it was before the
406 * <CODE>reduce</CODE> method was called. It only needs to know how many rows there
407 * was originally. The indexes that were moved to the end of the array are restored to
408 * their original position.
409 *
410 * @param original_numrows the number of rows before the reduction was performed
411 * @see #deleteRow
412 * @see #reduce
413 */
414 protected void restoreRows (int original_numrows)
415 {
416 int r, r2, s, d = numrows;
417
418 for (r = 0; r < d; r++)
419 {
420 if (row[r] > row[d])
421 {
422 s = row[d];
423 for (r2 = d; r2 > r; r2--)
424 row[r2] = row[r2-1];
425 row[r] = s;
426 d++;
427 if (d > original_numrows - 1) break;
428 }
429 }
430
431 numrows = original_numrows;
432 }
433
434 /**
435 * This is a simpler method for calculating column maxima. It does the same job as
436 * <CODE>computeColumnMaxima</CODE>, but without complexity of the SMAWK algorithm.
437 *
438 * <P>The matrix is required to be an object that implements the <CODE>Matrix</CODE>
439 * interface. It is also expected to be monotone. If it is not, the result is
440 * unpredictable but, unlike <CODE>computeColumnMaxima</CODE>, it cannot lead to an
441 * ArrayIndexOutOfBoundsException.</P>
442 *
443 * <P>This method does not return the maximum values itself, but just the indexes of
444 * their rows. Note that the array of column maxima (the computation result) must be
445 * created beforehand and its size must be equal to the number of columns of the
446 * matrix.</P>
447 *
448 * <P>It takes advantage of the monotone property of the matrix only (SMAWK explores
449 * the stronger constraint of total monotonicity), and therefore has a worst case time
450 * complexity of O(n * m), where n is the number of rows and m is the number of
451 * columns. However, this method tends to be faster for small matrices because it
452 * avoids recursions and row and column manipulations.</P>
453 *
454 * @param matrix the matrix that will have its column maxima computed
455 * @param col_maxima the array of column maxima (indexes of the rows containing
456 * maximum values of each column); this is the computation result
457 * @see #naiveComputeRowMaxima
458 */
459 public static void naiveComputeColumnMaxima (Matrix matrix, int col_maxima[])
460 {
461 int max_row = 0;
462 //int last_max = 0;
463
464 for (int c = 0; c < matrix.numColumns(); c ++)
465 {
466 for (int r = max_row; r < matrix.numRows(); r++)
467 if (matrix.valueAt(r,c) > matrix.valueAt(max_row,c))
468 max_row = r;
469
470 col_maxima[c] = max_row;
471
472 // uncomment the following code to raise an exception when
473 // the matrix is not monotone
474 /*
475 if (max_row < last_max)
476 throw new IllegalArgumentException ("Non totally monotone matrix.");
477 last_max = max_row;
478 max_row = 0;
479 */
480 }
481 }
482
483 /**
484 * This is a simpler method for calculating row maxima. It does not use the SMAWK
485 * algorithm.
486 *
487 * <P>The matrix is required to be an object that implements the <CODE>Matrix</CODE>
488 * interface. It is also expected to be monotone. If it is not, the result is
489 * unpredictable but, unlike <CODE>computeColumnMaxima</CODE>, it cannot lead to an
490 * ArrayIndexOutOfBoundsException.</P>
491 *
492 * <P>This method does not return the maximum values itself, but just the indexes of
493 * their columns. Note that the array of row maxima (the computation result) must be
494 * created beforehand and its size must be equal to the number of columns of the
495 * matrix.</P>
496 *
497 * <P>It takes advantage of the monotone property of the matrix only (SMAWK explores
498 * the stronger constraint of total monotonicity), and therefore has a worst case time
499 * complexity of O(n * m), where n is the number of rows and m is the number of
500 * columns. However, this method tends to be faster for small matrices because it
501 * avoids recursions and row and column manipulations.</P>
502 *
503 * @param matrix the matrix that will have its row maxima computed
504 * @param row_maxima the array of row maxima (indexes of the columns containing
505 * maximum values of each row); this is the computation result
506 * @see #naiveComputeColumnMaxima
507 */
508 public static void naiveComputeRowMaxima (Matrix matrix, int row_maxima[])
509 {
510 int max_col = 0;
511 //int last_max = 0;
512
513 for (int r = 0; r < matrix.numRows(); r++)
514 {
515 for (int c = max_col; c < matrix.numColumns(); c ++)
516 if (matrix.valueAt(r,c) > matrix.valueAt(r,max_col))
517 max_col = c;
518
519 row_maxima[r] = max_col;
520
521 // uncomment the following code to raise an exception when
522 // the matrix is not monotone
523 /*
524 if (max_col < last_max)
525 throw new IllegalArgumentException ("Non-monotone matrix.");
526 last_max = max_col;
527 max_col = 0;
528 */
529 }
530 }
531
532 /**
533 * Prints the current state of the matrix (reflecting deleted rows and columns) in the
534 * standard output. It can be used internally for debugging.
535 */
536 protected void printMatrix ()
537 {
538 int r, c;
539
540 System.out.print("row\\col\t| ");
541 for (c = 0; c < numcols; c++)
542 System.out.print(col[c] + "\t");
543
544 for (r = 0; r < numrows; r++)
545 {
546 System.out.print(row[r] + "\n\t| ");
547 for (c = 0; c < numcols; c++)
548 System.out.print(matrix.valueAt(r,c) + "\t");
549 }
550
551 }
552
553 /**
554 * Prints the contents of an object implementing the matrix interface in the standard
555 * output. It can be used for debugging.
556 *
557 * @param matrix a matrix
558 */
559 public static void printMatrix (Matrix matrix)
560 {
561 for (int r = 0; r < matrix.numRows(); r++)
562 {
563 for (int c = 0; c < matrix.numColumns(); c++)
564 System.out.print(matrix.valueAt(r,c) + "\t");
565 System.out.print("\n");
566 }
567 }
568 }