classdef SatDataset < handle % subclass from handle to pass-by-reference
    % Class to represent a dataset
    %
    % Work in progress!
    %
    % $Id$
    
    % read/write props
    properties
        name;
        basedir;
        subdir;
        re;
        filename;
        reader = @(varargin)[];
        granule_duration;
        satname; % for single-satellite datasets
        dataset; % e.g. MYD06_L2 for modis
        firstline_filename = 'firstline_$NAME_$SPEC';

        % defaults cannot be set in any subclass, because re-defining a
        % property is not allowed except under special circumstances that
        % cannot be met here. Therefore, defaults should be encompassed in
        % a property, a structure, 'defaults', that is /not/ defined here,
        % but only by the class at the end of the hierarchy
    end
    
    % read-only props (alterable with methods)
    properties (SetAccess = protected)
        collocated_datasets = [];
    end
    
    methods
        
        %% constructor
        
        function self = SatDataset(varargin)
            % Creates SatDataset object
            %
            % properties can be set upon constructing as when calling
            % 'struct'
            
            % this may be defined in subclasses
            if isprop(self, 'defaults')
                def_fields = fieldnames(self.defaults); %#ok<*MCNPN>
                for i = 1:length(def_fields)
                    fldnm = def_fields{i};
                    self.(fldnm) = self.defaults.(fldnm);
                end
            end

            for i = 1:2:nargin
                self.(varargin{i}) = varargin{i+1};
            end
            
        end
        
        %% things that can be done with datasets
        
        function fulldir = find_datadir_by_date(self, date, varargin)
            % find_datadir_by_date Find directory containing granules
            %
            % For the given datevec and specification, return a string with
            % the path to the directory that contains the granules for this particular
            % datevec.
            %
            % FORMAT
            %
            %   fulldir = ds.find_datadir_by_date(datevec, spec)
            %
            % IN
            %
            %   datevec     vector      [year month day] etc.
            %   spec        any         name of sat or cellstr {sat1 sat2}
            %
            % OUT
            %
            %   fulldir     string      path to directory
            
            % verify basedir is defined and exists
            
            spec = optargs(varargin, {''});
            errid = ['atmlab:' mfilename]; %find_datadir_by_date';
            
            assert(any(self.basedir), errid, 'No basedir initialised for %s', char(self.name));
            assert(exist(self.basedir, 'dir')~=0, errid, ...
                ['Configured data directory for %s is %s, ' ...
                'but this does not exist or is not a directory. ' ...
                'Please define basedir for %s correctly ' ...
                'or create the directory.'], ...
                self.name, self.basedir, self.name);
            
            fulldir = fullfile(self.basedir, self.subdir);
            
            fulldir = strrep_variables(fulldir, spec, date);
   
        end
        
        function [M, paths] = find_granules_by_date(self, date, varargin)
            % find_granules_by_date Find all granule start times for date/sat/dataset
            %
            % FIXME DOC
            %
            % This function finds all granule start times for granules with any coverage
            % for the the indicated for the indicated satellite/dataset pair. This
            % includes the last granule of the date before, unless this is explicitly
            % prohibited. Note that the last granule of the date before may or may not
            % actually contain information for the day requested. The
            % function assumes at most one granule for the previous date
            % contains information for the current date.
            %
            % FORMAT
            %
            %   [M, paths] = find_granules_by_date(date, spec, with_yesterday)
            %
            % OUT
            %
            %   M     N x m matrix     where each row corresponds to one granule and is
            %                          a datevec.
            %   paths cell strings     Cell array of strings, corresponding full paths
            %
            % IN
            %
            %   date
            %   spec
            %   with_yesterday (optional)
            %
            % $Id$
            
            % errid = 'atmlab:find_granules_by_date';
            
            [spec, with_yesterday] = optargs(varargin, {'', true});
            
            datadir = self.find_datadir_by_date(date, spec);
            yearstr = num2str(date(1), '%04d'); % for comparing easily with 2digit years
            
            matchy = self.re;
            if isequalwithequalnans(matchy, nan) || isempty(matchy)
                matchy = self.filename; % exact match only
            end
            
            matchy = strrep_variables(matchy, spec, date);
            
            files = dir(datadir);
            nfiles = length(files);
            M = zeros(nfiles, 5);
            for i = 1:nfiles
                fname = files(i).name;
                nam = regexp(fname, matchy, 'names');
                if ~isempty(nam)
                    % if present, year/month/day should be the same
                    if (isfield(nam, 'year02') && ~isempty(nam.year02) && ~strcmp(nam.year02, yearstr(3:4))) || ...
                            (isfield(nam, 'year04') && ~isempty(nam.year04) && str2double(nam.year04)~=date(1)) || ...
                            (isfield(nam, 'year') && ~isempty(nam.year) && str2double(nam.year)~=date(1)) || ...
                            (isfield(nam, 'month') && ~isempty(nam.month) && str2double(nam.month)~=date(2)) || ...
                            (isfield(nam, 'day') && ~isempty(nam.day) && str2double(nam.day)~=date(3)) || ...
                            (isfield(nam, 'doy') && ~isempty(nam.doy) && str2double(nam.doy)~=dayofyear(date(1), date(2), date(3)));
                        continue;
                    end
                    
                    M(i, 1:3) = date(1:3);
                    if isfield(nam, 'hour')
                        M(i, 4) = str2double(nam.hour);
                    end
                    if isfield(nam, 'minute')
                        M(i, 5) = str2double(nam.minute);
                    end
                end
            end
            % all paths
            paths = cellfun(@(f) fullfile(datadir, f), {files.name}, 'UniformOutput', false);
            % remove lines with zeroes (those are not granules)
            nogran = M(:, 1)==0;
            M(nogran, :) = [];
            paths(nogran) = [];
            % sort
            [M, I] = sortrows(M);
            paths = paths(I);
            % add yesterday
            if with_yesterday
                yesterday = datevec(datenum(date)-1);
                [M_yesterday, paths_yesterday] = self.find_granules_by_date(yesterday, spec, false);
                if ~isempty(M_yesterday) % maybe today is genesis/big bang/epoch/birth/1970-01-01
                    M = [M_yesterday(end, :); M];
                    paths = [paths_yesterday(end) paths];
                end
            end
        end
        
        function datafile = find_granule_by_datetime(self, datevec, varargin)
        
            % find_granule_by_datetime Return full path of datafile by date/time
            %
            % Returns the full path of the datafile corresponding to the input
            % arguments. There are two modes:
            %
            %   - If dataset.filename is defined, the path is
            %     calculated directly from datasets_config settings and the arguments
            %     passed on. The file may or may not actually exist, so in this case,
            %     you can use it to calculate the path when planning to create it.
            %
            %   - If this is not defined, then dataset.re has to
            %     be defined. It will search the filesystem for a file matching the
            %     regular expression and the indicated datevec/spec for this dataset.
            %     In this case, one can apply a tolerance in seconds.
            %
            % FORMAT
            %
            %   datafile = dataset.find_granule_by_datetime(date, [spec[, tol]])
            %
            % IN
            %
            %   datevec     vector  Date/time to find datafile for
            %   spec        string/ Name of satellite. For datasets belonging to one
            %               cellstr satellite, a simple string. For datasets belonging
            %                       to two satellites (such as collocations), a cell
            %                       array of two strings.
            %   tol         number  Optional: tolerance in seconds. Due to
            %                       rounding errors, a date like 2009-12-06
            %                       20:53:00 passed to date2unixsecs and
            %                       back to unixsecs2date becomes
            %                       2009-12-06 20:52:59.99999, which is
            %                       hard to round in a straightforward way
            %                       (don't want ...52:60). By default, a
            %                       tolerance of 0.01 seconds is applied.
            %
            % OUT
            %
            %   datafile    string     Path to the looked-for datafile.
            
            errid = 'atmlab:find_granule_by_datetime';
            [spec, tol] = optargs(varargin, {'', 0.01});
            
            % implementation:
            % 1. find the directory containing the files from basedir/subdir
            % 2. if possible, calculate the filename directly
            % 3. otherwise, list all files and match with regexp+tolerance
            
            fulldir = self.find_datadir_by_date(datevec, spec);
            
            if ~isequalwithequalnans(self.filename, nan) && ~isempty(self.filename) % calculate directly
                fn = strrep_variables(self.filename, spec, datevec);
                datafile = fullfile(fulldir, fn);
            else % will search through all granules with find_granules_by_date
                [granules, paths] = self.find_granules_by_date(datevec, spec, false);
                if tol>0 % use tolerance
                    g = mat2cell(granules, size(granules, 1), [1 1 1 1 1]);
                    grans_unisecs = date2unixsecs(g{:});
                    X = num2cell(datevec);
                    dv_unisecs = date2unixsecs(X{:});
                    found = abs(grans_unisecs - dv_unisecs) < tol;
                else
                    found = granules(:, 4)==datevec(4) & granules(:, 5)==datevec(5);
                end
                
                nfound = sum(found);
                if nfound==0
                    error(errid, 'No datafile found for %s %s [%s]', self.name, spec, num2str(datevec));
                elseif nfound > 1
                    error(errid, 'Multiple datefiles found for %s %s', self.name, spec);
                else
                    datafile = paths{found};
                end
                
            end
            
        end        
        
        function [allgrans, allpaths] = find_granules_for_period(self, date1, date2, spec)
            
            % find_granules_for_period List all granules for sat/dataset for period
            %
            % For the period between date1 and date2, list all granules (as vectors
            % indicating the starting date/time) available.
            %
            % FORMAT
            %
            %   [allgrans, allpaths] = find_granules_for_period(date1, date2, spec)
            %
            % IN
            %
            %   date1   datevec     starting date
            %   date2   datevec     ending date
            %   spec    various
            %
            % OUT
            %
            %   allgrans matrix     all granules in daterange
            %   allpaths cellstr    all paths to those granules
            
            dates = daterange(date1, date2);
            ndates = size(dates, 1);
            
            allgrans = nan*zeros(ndates*15, 5);
            allpaths = cell(size(allgrans));
            
            n = 0;
            for i = 1:ndates
                date = dates(i, :);
                [grans, paths] = self.find_granules_by_date(date, spec, false);
                ngrans = size(grans, 1);
                allgrans(n+1:n+ngrans, :) = grans;
                allpaths(n+1:n+ngrans) = paths;
                n = n + ngrans;
            end
            
            to_remove = isnan(allgrans(:, 1));
            allgrans(to_remove, :) = [];
            allpaths(to_remove) = [];
            [allgrans, I] = sortrows(allgrans);
            allpaths = allpaths(I);

        end
        
        function data = read_granule(self, date, varargin)
            % read_granule Read any granule
            %
            % High-level function to read a particular granule for a
            % satellite/sensor-pair, if one doesn't know what function is suitable to
            % use to read it. It also gets rid of ballast.
            %
            % Uses member 'reader' to find out what function to use to
            % read this particular data. Uses self.granule_first_line to find out the first
            % scanline to use (e.g. the rest are doubles).
            %
            % FORMAT
            %
            %   M = read_granule(date[, spec[, extra[, remdouble[, force]]]])
            %
            % IN
            %
            %   datevec     vector      Date-vector indicating the starting time.
            %   spec        various     (optional) E.g. satellite (noaa18, cloudsat, etc.) 
            %   extra       cellstr     (optional) extra args to reader
            %   remdouble   logical     (optional) Remove doubles or not? Default true.
            %   force       logical     (optional) rather than throwing an error,
            %                           return [] in case of error. Default false.
            %
            % OUT
            %
            %   data        struct  Contains at least lat, lon, time and one or more
            %                       data fields. Exact results depend on the particular
            %                       satellite/sensor-pair.
            %
            % $Id$
             
            % optional arguments
            [spec, extra, remdouble, force] = optargs(varargin, {'', {}, true, false});
            
            % find datafile and function to read it
            datafile = self.find_granule_by_datetime(date, spec);
            
            % read datafile
            %logtext(colloc_config('stdout'), '%s(''%s'')\n', ...
            %    func2str(reader), datafile);
            logtext(atmlab('OUT'), 'Reading %s\n', datafile);
            try
                data = self.reader(datafile, extra);
            catch ME
                if force
                    switch ME.identifier
                        case 'atmlab:invalid_data'
                            logtext(atmlab('ERR'), ...
                                'Unable to read: %s\n', ME.message);
                            return
                        otherwise
                            ME.rethrow();
                    end
                else
                    ME.rethrow();
                end
            end
            %logtext(colloc_config('stdout'), 'read %s\n', datafile);
            nlines = size(data.time, 1);
            
            if remdouble
                % read ballast
                firstline = self.granule_first_line(date, spec);
                if firstline > 0
                    % get rid of ballast in all fields
                    for f = fieldnames(data)'
                        fn = f{1};
                        if isnumeric(data.(fn)) && size(data.(fn), 1) == nlines
                            % magic in next lines explained at
                            % http://www.mathworks.de/matlabcentral/newsreader/view_thread/290890
                            sz = size(data.(fn));
                            data.(fn) = reshape(data.(fn)(firstline:nlines, :), [nlines-firstline+1, sz(2:end)]);
                        end
                    end
                end
            end
        end
                
        function first = granule_first_line(self, d, varargin)
            % granule_first_line Returns first scanline not present in previous granule
            %
            % For a certain granule, return the number of the first scanline that
            % is not in the previous scanline. This m-file uses a previously created
            % database (a hashtable). This hash-table is cached between subsequent
            % calls of the function.
            %
            % There should exist an entry for each satellite/sensor granule. If it's
            % not found, an error is raised, and there is probably a bug somewhere. If
            % the satellite/sensor granule exists, but there is no (unique) previous
            % granule, line -2 is returned.
            %
            % FORMAT
            %
            %   first = granule_first_line(datevec[, spec[, reload]])
            %
            % IN
            %
            %   d           various     Starting date/time for granule,
            %                           datevec or unixsecs
            %   spec        various     (optional) Might be indicating satellite(s)
            %   reload      logical     (optional) Reload scanline data (i.e. not
            %                           cached). Defaults to false.
            %   force       logical     (optional) If nothing found, don't
            %                           throw an error, but return empty.
            %
            % OUT
            %
            %   first       number      First scanline not in previous granule.
            %                           Special values: -1 (no data found), -2 (no
            %                           previous granule found)
            
            persistent S;
            
            if isempty(S)
                S = struct;
            end
            
            [spec, reload, force] = optargs(varargin, {'', false, false});
            
            assert(ischar(spec), ['atmlab:' mfilename], ...
                'function works only for single-satellite datasets');
            if isempty(spec)
                sat = self.satname;
            else
                sat = spec;
            end
            
            if isfield(S, sat) && isfield(S.(sat), self.name) && ~reload
                ht = S.(sat).(self.name);
            else
                scanfile = datasets_config('firstline_data');
                scanfile = strrep(scanfile, '$SAT', sat);
                scanfile = strrep(scanfile, '$SENSOR', self.name);
                
                t = load(scanfile);
                ht = t.ht;
                S.(sat).(self.name) = ht;
            end
            
            if isscalar(d)
                unisecs = uint32(d);
            else
                dv = num2cell(d);
                unisecs = uint32(date2unixsecs(dv{:}));
            end
            
            first = ht.get(unisecs);
            if isempty(first) && ~force
                error(['atmlab:' mfilename ':missing_firstline'], ['no data found for %s @ %d. ', ...
                    'You may want to run self.find_granule_first_line(...) ', ...
                    'for a sufficiently long period.'], ...
                    self.name, unisecs);
            end
            
        end
        
        function find_granule_first_line(self, startdate, enddate, varargin)
            % find_granule_first_line Create hashtable with scanline overlaps
            %
            % Creates a hashtable (such as used by granule_first_line) that maps for
            % each granule the first scanline not occuring in the previous
            % scanline.
            %
            % The resulting hashtable is written to a file according to
            % datasets_config('firstline_data'), which is also where
            % granule_first_line is looking for it.
            %
            % FORMAT
            %
            %   find_scanline_first_line(startdate, enddate, spec)
            %
            % IN
            %
            %   startdate   datevec     start here
            %   enddate     datevec     end here
            %   spec        various     specification
            %
            % OUT
            %
            %   none, but writes a file
            
            spec = optargs(varargin, {''});            
            
            if isempty(spec)
                sat = self.satname;
            else
                sat = spec;    
            end

            scanfile = datasets_config('firstline_data');
            scanfile = strrep(scanfile, '$SAT', sat);
            scanfile = strrep(scanfile, '$SENSOR', self.name);
            
            logtext(atmlab('OUT'), 'Locating granules\n');
            allgrans = self.find_granules_for_period(startdate, enddate, spec);
            ngrans = size(allgrans, 1);
            logtext(atmlab('OUT'), 'Found %d granules\n', ngrans);
            
            if exist(scanfile, 'file')
                tm = load(scanfile);
                ht = tm.ht;
            else
                ht = java.util.Hashtable;
            end
            
            uni_first = date2unixsecs(allgrans(1, 1), allgrans(1, 2), allgrans(1, 3), allgrans(1, 4), allgrans(1, 5));
            if ~ht.containsKey(uint32(uni_first))
                logtext(atmlab('OUT'), 'Setting very first to flag\n');
                ht.put(uint32(uni_first), -5);
            end
            next = 0;
            
            for i = 1:ngrans-1
                logtext(atmlab('OUT'), 'granule %d/%d: %d-%02d-%02d %02d:%02d\n', i, ngrans-1, ...
                    allgrans(i, 1), allgrans(i, 2), allgrans(i, 3), allgrans(i, 4), allgrans(i, 5));
                uni = date2unixsecs(allgrans(i+1, 1), allgrans(i+1, 2), allgrans(i+1, 3), allgrans(i+1, 4), allgrans(i+1, 5));
                if ht.containsKey(uint32(uni)) && ht.get(uint32(uni)) ~= -5
                    logtext(atmlab('OUT'), 'Already exists (%d:%d)\n', uint32(uni), ht.get(uint32(uni)));
                    continue
                end
                try
                    couldreadcur = false;
                    if isequal(next, 0)
                        cur = self.read_granule(allgrans(i, :), spec, {}, false, false);
                    else
                        cur = next;
                    end
                    couldreadcur = true;
                    couldreadnext = false;
                    next = self.read_granule(allgrans(i+1, :), spec, {}, false, false);
                    couldreadnext = true;
                catch ME
                    switch ME.identifier
                        case {'atmlab:find_datafile_by_date', 'atmlab:atovs_get_l1c:zamsu2l1c', 'atmlab:invalid_data'}
                            logtext(atmlab('ERR'), 'Problem: %s\n', ME.message);
                        otherwise
                            ME.rethrow();
                    end
                end
                %uni = date2unixsecs(allgrans(i+1, 1), allgrans(i+1, 2), allgrans(i+1, 3), allgrans(i+1, 4), allgrans(i+1, 5));
                if couldreadcur && couldreadnext
                    t_cur = cur.epoch + cur.time;
                    t_next = next.epoch + next.time;
                    %[t_cur, t_next] = unify_time_axis(cur.time, next.time);
                    firstline = find(t_next > t_cur(end), 1, 'first');
                    if ~isempty(firstline)
                        logtext(atmlab('OUT'), 'First line: %d\n', firstline);
                        ht.put(uint32(uni), firstline);
                    else
                        logtext(atmlab('OUT'), 'No first line, setting to -2\n');
                        ht.put(uint32(uni), -2);
                    end
                elseif couldreadcur
                    logtext(atmlab('OUT'), 'Could not read next, setting next to -3\n');
                    ht.put(uint32(uni), -3);
                else
                    logtext(atmlab('OUT'), 'Could not read current. setting to -4\n');
                    ht.put(uint32(uni), -4);
                end
            end
            
            save(scanfile, 'ht');           
            
        end
        
        %% methods for internal use (not private, but from other classes)
       
        function add_collocated_dataset(ds, cd)
            % INTERNAL USE. Adds a collocated dataset.
            %
            % This is called by the CollocatedDataset constructor.
            ds.collocated_datasets = [ds.collocated_datasets cd];
        end
        
        function set.name(self, value)
            if ~isempty(self.name)
                % deregister old name
                datasets('delete', self);
            end
            self.name = value;
            datasets(self);
        end
    end
end