########################################################################
#
#       License: BSD
#       Created: June 02, 2004
#       Author:  Francesc Altet - faltet@carabos.com
#
#       $Id: IndexArray.py 1496 2006-03-13 09:48:13Z faltet $
#
########################################################################

"""Here is defined the IndexArray class.

See IndexArray class docstring for more info.

Classes:

    IndexArray

Functions:

    calcChunksize

Misc variables:

    __version__
    obversion
    minRowIndex


"""

import warnings
import sys

import numarray
import numarray.strings as strings
import numarray.records as records

import tables.hdf5Extension as hdf5Extension
from tables.Atom import Atom, StringAtom
from tables.EArray import EArray


__version__ = "$Revision: 1496 $"

# Default version for IndexArray objects.
obversion = "1.0"    # initial version

# The minimum row number in a column that can be indexed in tests.
# calcChunksize() also uses it, in test mode, as the base unit for its
# row-count thresholds and for the smallest slice/chunk sizes.
minRowIndex = 10


def calcChunksize(expectedrows, testmode=False):
    """Calculate the HDF5 chunk size for index and sorted arrays.

    The logic to do that is based purely on experiments playing with
    different chunksizes and compression flags. It is obvious that
    using big chunks optimizes the I/O speed, but if they are too
    large, the uncompressor takes too much time. This might (should)
    be further optimized by doing more experiments.

    Arguments:

    expectedrows -- An estimate of the number of elements to index.

    testmode -- When true, small sizes based on the module-level
        ``minRowIndex`` are returned instead of the production ones.

    Returns a ``(nelemslice, chunksize)`` tuple: the number of elements
    per slice and the chunk size for the slice dimension.

    Raises ``ValueError`` in test mode when `expectedrows` is larger
    than ``minRowIndex*1000``.

    """

    if testmode:
        maxtestrows = minRowIndex*1000
        if expectedrows < minRowIndex*10:
            nelemslice = minRowIndex
            chunksize = minRowIndex
        elif expectedrows < minRowIndex*100:
            nelemslice = 100
            chunksize = 50
        elif expectedrows <= maxtestrows:
            nelemslice = 1000
            chunksize = 600
        else:
            # The limit has to be computed *before* the string
            # interpolation: ``"..." % minRowIndex*1000`` would format the
            # message with ``minRowIndex`` and then repeat it 1000 times.
            raise ValueError(
                "expected rows cannot be larger than %s in test mode"
                % (maxtestrows,))
        return (nelemslice, chunksize)

    expMrows = expectedrows / 1000000.  # Multiples of one million

    # Less than 10 thousand rows (expMrows < 0.01) is too few for indexing
    # to represent a significant gain (that has been checked
    # experimentally), so that range has no slot of its own.
    if expMrows < 0.1:      # expected rows < 100 thousand
        nelemslice = 5000   # (best experimental)
        chunksize = 1000
    elif expMrows < 1:      # expected rows < 1 million
        nelemslice = 20000  # (best experimental)
        chunksize = 2000    # (best experimental)
    elif expMrows < 10:     # expected rows < 10 million
        # These values are the best for the worst case (experimental).
        # nelemslice = 500000 with chunksize = 5000 was the best for the
        # best case.
        nelemslice = 100000
        chunksize = 2000
    elif expMrows < 100:    # expected rows < 100 million
        nelemslice = 100000
        chunksize = 5000
    elif expMrows < 1000:   # expected rows < 1000 million
        nelemslice = 200000  # Experimental (best)
        # chunksize = 10000 was the experimental best, but 5000 is used.
        chunksize = 5000
    else:                   # expected rows >= 1 billion
        # nelemslice = 1000000 was better for small machines and
        # nelemslice = 2000000 for big machines.
        nelemslice = 500000
        chunksize = 10000   # Experimental

    return (nelemslice, chunksize)


class IndexArray(hdf5Extension.IndexArray, EArray):

    """Represent the index (sorted or reverse index) dataset in HDF5 file.

    All Numeric and numarray typecodes are supported except for complex
    datatypes.

    Methods:

      Common to all EArray's:
        read(start, stop, step)
        iterrows(start, stop, step)
        append(object)


    Instance variables:

      Common to all EArray's:

        type -- The type class for the array.
        itemsize -- The size of the atomic items. Specially useful for
            CharArrays.
        flavor -- The flavor of this object.
        nrow -- On iterators, this is the index of the row currently
            being dealt with.

      Specific of IndexArray:
        extdim -- The enlargeable dimension (always the first, or 0).
        nrows -- The number of slices in index.
        nelemslice -- The number of elements per slice.
        chunksize -- The HDF5 chunksize for the slice dimension (the second).

    """

    _c_classId = "INDEXARRAY"


    def __init__(self, parentNode, name,
                 atom=None, title="",
                 filters=None,
                 testmode=False,
                 expectedrows=0):
        """Create an IndexArray instance.

        Keyword arguments:

        atom -- An Atom object representing the shape, type and flavor
            of the atomic objects to be saved. Only scalar atoms are
            supported.  When it is None, the slice and chunk sizes are
            not computed here; they are read back from the existing
            node in `_g_postInitHook()`.

        title -- Sets a TITLE attribute on the array entity.

        filters -- An instance of the Filters class that provides
            information about the desired I/O filters to be applied
            during the life of this object.

        testmode -- Enables test mode for index chunk size
            calculation (see `calcChunksize()`).

        expectedrows -- Represents an user estimate about the number
            of elements to index.

        """

        self.testmode = testmode
        """Enables test mode for index chunk size calculation."""
        self.nelemslice = None
        """The number of elements per slice."""
        self.chunksize = None
        """The HDF5 chunksize for the slice dimension (the second)."""

        # Compute the optimum number of slices and chunk sizes
        # for newly created index arrays.
        if atom is not None:
            (self.nelemslice, self.chunksize) = (
                calcChunksize(expectedrows, testmode))

        # Index creation is never logged.
        super(IndexArray, self).__init__(
            parentNode, name, atom, title, filters, expectedrows, log=False)


    def _g_create(self):
        """Create the node through the parent class and check its layout.

        Returns whatever the parent `_g_create()` returns.  The
        assertions verify that the atom is scalar and that the
        resulting shape, extendable dimension and chunk size agree
        with ``nelemslice`` and ``chunksize``.

        """
        assert self.atom.shape == (0, 1), "only scalar columns can be indexed"
        objectId = super(IndexArray, self)._g_create()
        assert self.extdim == 0, "computed extendable dimension is wrong"
        assert self.shape == (0, self.nelemslice), "invalid shape"
        assert self._v_chunksize == (1, self.chunksize), "invalid chunk size"
        return objectId


    def _calcTuplesAndChunks(self, atom, extdim, expectedrows, compress):
        """Return the ``(_v_maxTuples, _v_chunksize)`` pair.

        The arguments are ignored: the chunk always spans one slice
        row and ``self.chunksize`` elements.

        """
        return (0, (1, self.chunksize))  # (_v_maxTuples, _v_chunksize)


    def _createEArray(self, title):
        """Fix the array shape and delegate the creation to the parent."""
        # The shape of the index array needs to be fixed before creating it.
        self.shape = (0, self.nelemslice)
        super(IndexArray, self)._createEArray(title)


    def _g_postInitHook(self):
        """Recover the slice and chunk sizes of an already existing node."""
        # Set ``nelemslice`` and ``chunksize`` when opening an existing node;
        # otherwise, they are already set.
        if not self._v_new:
            self.nelemslice = self.shape[1]
            self.chunksize = self._v_chunksize[1]
        super(IndexArray, self)._g_postInitHook()


    def append(self, arr):
        """Append the object to this (enlargeable) object.

        Note that `arr` is reshaped *in place* to ``(1, arr.shape[0])``
        before being appended, so the caller's array keeps that shape
        afterwards.

        """
        arr.shape = (1, arr.shape[0])
        self._append(arr)


    # This is coded in pyrex as well, but the improvement in speed is very
    # little. So, it's better to let _searchBin live here.
    def _searchBin(self, nrow, item):
        """Search the limits of a range of values in a sorted slice.

        nrow -- The slice number, as passed on to `_readSortedSlice()`
            and the `_interSearch_*()` helpers.

        item -- A ``(item1, item2)`` pair.  `item1` is looked up with
            the left-bisection helpers and `item2` with the
            right-bisection ones.

        Returns a ``(result1, result2, niter)`` tuple, where `result1`
        and `result2` are the positions found for `item1` and `item2`
        and `niter` counts the slice reads done here plus the
        iterations reported by the `_interSearch_*()` helpers.

        The first and the last chunk of the slice are checked first;
        only the items not settled there are searched in the rest of
        the slice.

        """
        nelemslice = self.shape[1]
        hi = nelemslice
        item1, item2 = item
        item1done = False
        item2done = False
        chunksize = self.chunksize # Number of elements/chunksize
        niter = 1

        # First, look at the beginning of the slice (that could save lots
        # of time)
        chunk = self._readSortedSlice(nrow, 0, chunksize)
        # Look for items at the beginning of sorted slices
        result1 = self._bisect_left(chunk, item1, chunksize)
        if 0 <= result1 < chunksize:
            item1done = True
        result2 = self._bisect_right(chunk, item2, chunksize)
        if 0 <= result2 < chunksize:
            item2done = True
        if item1done and item2done:
            return (result1, result2, niter)

        # Then, look for items at the end of the sorted slice
        chunk = self._readSortedSlice(nrow, hi-chunksize, hi)
        niter += 1
        if not item1done:
            result1 = self._bisect_left(chunk, item1, chunksize)
            if 0 < result1 <= chunksize:
                item1done = True
                # Translate the chunk-relative position to the slice
                result1 = hi - chunksize + result1
        if not item2done:
            result2 = self._bisect_right(chunk, item2, chunksize)
            if 0 < result2 <= chunksize:
                item2done = True
                # Translate the chunk-relative position to the slice
                result2 = hi - chunksize + result2
        if item1done and item2done:
            return (result1, result2, niter)

        # Finally, do a lookup for item1 and item2 if they were not found
        # Lookup in the middle of slice for item1
        if not item1done:
            lo = 0
            hi = nelemslice
            beginning = 1
            result1 = 1  # a number different from 0
            while beginning and result1 != 0:
                (result1, beginning, subiter) = \
                          self._interSearch_left(nrow, chunksize,
                                                 item1, lo, hi)
                niter = niter + subiter
                if result1 == hi:  # The item is completely at right
                    break
                else:
                    hi = result1        # one chunk to the left
                    lo = hi - chunksize
        # Lookup in the middle of slice for item2
        if not item2done:
            lo = 0
            hi = nelemslice
            ending = 1
            result2 = 1  # a number different from nelemslice
            while ending and result2 != nelemslice:
                (result2, ending, subiter) = \
                          self._interSearch_right(nrow, chunksize,
                                                  item2, lo, hi)
                # Each iteration count is added exactly once, here (as it
                # is done for item1 above).
                niter = niter + subiter
                if result2 == lo:  # The item is completely at left
                    break
                else:
                    hi = result2 + chunksize      # one chunk to the right
                    lo = result2
        return (result1, result2, niter)


    def __str__(self):
        "A compact representation of this class"
        return "IndexArray(path=%s)" % self._v_pathname


    def __repr__(self):
        """A verbose representation of this class"""

        return """%s
  atom = %r
  nrows = %s
  nelemslice = %s
  chunksize = %s
  byteorder = %r""" % (self, self.atom, self.nrows, self.nelemslice,
                       self.chunksize, self.byteorder)
