classdef HomemadeDataset < SatDataset
    % Any SatDataset that is locally created and stored.
    %
    % This class is an 'intermediate' class between SatDataset
    % and CollocatedDataset, but also parent to AssociatedDataset.
    % It is unlikely you wish to instantiate this class directly.
    %
    % This class contains methods and properties that are related to
    % storing data to disk and reading data in the same (NetCDF) format
    % again from disk. It inherits from SatDataset and is parent to
    % CollocatedDataset and AssociatedDataset.
    %
    %
    % HomemadeDataset Properties:
    %
    %   cols -      Structure describing columns for matrix-stored data
    %   mattype -   Type to use internally. Normally 'double'.
    %  (remaining properties inherited from SatDataset.
    %   Use properties for a complete listing)
    %
    % HomemadeDataset Methods:
    %
    % Constructor:
    %
    %   HomemadeDataset -    Create HomemadeDataset object
    %
    % I/O:
    %
    %   store -              Store cols-described matrix to NetCDF
    %   read_single_day -    Read a single day of stored data
    %  (remaining methods inherited from SatDataset)
    %
    % See also: SatDataset (superclass), CollocatedDataset (subclass),
    % AssociatedDataset (abstract subclass)
    %
    % Don't forget the Collocation's User Guide.
        
    % Note on implicit and explicit constructor calling:
    % http://www.mathworks.se/help/matlab/matlab_oop/creating-subclasses--syntax-and-techniques.html
    
    % $Id$
    
    properties
        % structure describing columns of (internally) stored data
        %
        % The cols-structure describes columns of data passed around
        % internally as a matrix. For example, mid-level methods like
        % CollocatedDataset.collocate_date
        % return a matrix, whose columns are then described by the
        % cols-member.
        cols = struct;
        
        % flag controlling how to consider granules already existing
        %
        % This flag, defaulting to 'false', controls how the toolkit treats
        % files that are already there. For example, by default,
        % collocations are not rerun and associateddatasets are not
        % recalculated if files already exist. The exact behaviour depends
        % on the class. Usually, the meaning is straightforward: false
        % means do not overwrite, true means do so. This can be set per
        % dataset, so if one wants to use the core but regenerate
        % additionals, one can set the property for the additionals to
        % true.
        %
        % Since atmlab-2-1-334, the value for FieldCopier and Collapser
        % (NOTE: FIXME, ALL AssociatedDataset?) can have other values.
        % Note that these work ONLY when processing is delayed; when
        % processing is done along with the CollocatedDataset, the system
        % will always regenerate additionals.
        %
        %   overwrite = 2       read original files, and check per field, i.e.
        %                       extend them without changing existing fields
        %
        %   overwrite = {'cell', 'array', 'of', 'fields'}
        %
        %                       read original files, redo all fields in the
        %                       cell array, keep others untouched.
        overwrite = false;
        
        % Set diskcache to store results of '.read' or others to disk.
        %
        % Running .read() can take considerable time. Therefore, it is worthwhile
        % to cache the results. If 'pcd' is set to a PersistentCachedData,
        % .read() will cache the results and read from disk if possible.
        % WARNING: DO NOT SET PCD TO A DIRECTORY YOU ARE OTHERWISE
        % USING! The caching mechanism may remove files from this
        % directory, so be very careful. Caching works by calculating a key
        % (almost certainly unique) based on input arguments and storing
        % the results in a .mat-file in the cache-directory.
        %
        % Example use:
        %   >> d = datasets;
        %   >> d.collocation_mhs_amsub.pcd = PersistentCachedData('/local/gerrit/cache');
        %   >> [M, c] = d.collocation_mhs_amsub.read([2006 4 25], [2006 4 26], {'noaa18', 'noaa16'}, {'LAT1', 'LAT2', 'BT1'});
        %   (later)
        %   >> [M, c] = d.collocation_mhs_amsub.read([2006 4 25], [2006 4 26], {'noaa18', 'noaa16'}, {'LAT1', 'LAT2', 'BT1'});
        %   14-May-2012 11:37:33.707:PersistentCachedData.get_entry:54:Reading from persistent cache: /local/gerrit/cache/aff3735816ae90b11a4ddfd571f88e3b.mat
        pcd;
        
        % Type to use for internal storage
        %
        % Internally, the collocation toolkit passes around collocations as
        % matrices. To be on the safe side, this normally uses the type
        % 'double'. If you are low on memory, or trying to store a lot of
        % data and/or have a lot of collocations, you can set this to
        % 'single'. Do not set it to any non-floating point type unless
        % you want severe loss of precision and crashing code.
        mattype = 'double';
        
        % Users may set this for various purposes. Among others, it's used
        % by strrep (see repvars).
        version;
    end
    
    properties (Dependent = true)
        % contains detailed information on how data are stored etc.
        %
        % To be documented in more detail.
        members;
    end
    
    properties (Access = protected)
        % for internal usage: backing store for the 'members' dependent
        % property, and a flag telling the getter to compute members
        % dynamically instead (see get.members / set.members).
        ownmembers;
        dynamic_members = false;
    end
    
    methods
        %% overload parent methods
        
        function self = HomemadeDataset(varargin)
            % Create a HomemadeDataset object.
            %
            % All arguments are passed on to the SatDataset constructor.
            % Afterwards, a default reader (read_homemade_granule) and a
            % default granule duration of one day (86400 s) are set,
            % unless the caller provided them explicitly.
            self = self@SatDataset(varargin{:});
            
            %{
            args = varargin;

            % extract 'members' because SatDataset is not allowed to set
            % HomemadeDataset private property, and it must be private in
            % order for subclasses to redefine it etc.

            setmem = false;
            if any(strcmp(args, 'members'))
                setmem = true;
                i = find(strcmp(args, 'members'));
                mem = args{i+1};
                args = [args(1:i-1) args(i+2:end)];
            end

            self = self@SatDataset(args{:}); 
            
            if setmem
                self.members = mem;
            end
            %}
            
            if ~any(strcmp(varargin, 'reader')) % set reader
                self.reader = @self.read_homemade_granule;
            end

            if isequal(self.granule_duration, [])
                self.granule_duration = 86400;
            end
        end
        
        %% implement new methods
        
        function [fn, global_atts] = store(self, date, spec, data, varargin)
            % store Write collocation data to netcdf file
            %
            % Write collocation data for date in data to a netcdf file.
            % The filename is determined from self, date, spec.
            % This is a relatively low-level function and not normally
            % called directly; rather call CollocatedDataset.collocate_and_store_date
            %
            % FORMAT
            %
            %   [fn, global_atts] = obj.store(date, spec, data[, info])
            %
            % IN
            %
            %   date    vector [year month day] to which data corresponds
            %   spec    string / cellstr, satellite (or so)
            %   data    actual data, columns described by self.cols
            %           unless 'localcols' argument is given
            %   info    (optional) struct with more info to put in NetCDF
            %   localcols   structure describing columns of data
            %
            % OUT
            %
            %   fn      file data was written to
            %   atts    global attributes that were written to file
            %
            % $Id$
            
            if ~all(isfinite(data(:)))
                warning(['atmlab:' mfilename ':format'], ...
                    'Found nans or infs in data.  Data should be finite.  I''ll do my best.');
            end
            
            fn = self.find_granule_by_datetime(date, spec);
            
            [info, localcols] = optargs(varargin, {struct(), self.cols});
            
            % create or append to the file
            if exist(fn, 'file')
                if iscell(self.overwrite)
                    % per-field overwrite mode: keep the existing file and
                    % extend/redo fields in it.  (A cell array cannot be
                    % used as a 'switch' expression, hence this guard.)
                    newfile = false;
                else
                    switch self.overwrite
                        case 0
                            error(['atmlab:' mfilename ':fileexists'], ...
                                ['You really shouldn''t tell me not to overwrite ' ...
                                 'and then tell me to store data where a file already exists.']);
                        case 1
                            newfile = true;
                        otherwise
                            % FIXME: should add a line to 'history' global
                            % attribute
                            newfile = false;
                    end
                end
            else
                newfile = true;
            end
            
            outdir = fileparts(fn);
            if ~exist(outdir, 'dir')
                logtext(atmlab('OUT'), 'Creating %s\n', outdir);
                mkdir(outdir);
            end
            
            %% Estimate size
            ncollocs = size(data, 1);
            nbytes = self.linesize(localcols) * ncollocs;
            logtext(atmlab('OUT'), 'Will write %d collocations, %s of uncompressed data (not including header)\n', ncollocs, nbytes2string(nbytes));
            
            if newfile
                % temporary filename, later compressed written to final place
                temp_out = tempname(atmlab('WORK_AREA'));
                logtext(atmlab('OUT'), 'Writing %s\n', temp_out);
                ncid = netcdf.create(temp_out, 'NC_CLOBBER'); % overwrite existing
            else
                logtext(atmlab('OUT'), 'Appending to %s\n', fn);
                temp_out = uncompress(fn, atmlab('WORK_AREA'), struct('unidentified', 'error'));
                ncid = netcdf.open(temp_out, 'WRITE');
                netcdf.reDef(ncid); % put in header define mode
            end
            % make sure the temp file and the NetCDF id are cleaned up
            % even when an error is thrown below
            cleanupObj = onCleanup(@() self.cleanup(temp_out, ncid));
            
            if newfile
                % define the dimensions
                dim_collocs = netcdf.defDim(ncid, 'Collocations', ncollocs);
                
                % put global attributes
                global_atts = struct();
                global_atts.Conventions = 'CF-1.4';
                global_atts.title = 'Collocations';
                global_atts.date = iso_timestamp();
                global_atts.institution = ['Department of Computer Science, Electrical and Space Engineering, Division of Space Technology, Lule' char(unicode2native('å')) ' University of Technology, Kiruna, Sweden'];
                global_atts.source = 'Collocation codes, part of atmlab';
                global_atts.references = 'Holl et al. (2010); John et al. (2012)';
                global_atts.contact = 'gerrit.holl@gmail.com';
                global_atts.software_version = atmlab_version;
                global_atts.id = [atmlab_version() ' -- ' iso_timestamp() ' -- ' fn];
                global_atts.license = ...
                    ['This dataset is made available under the ' ...
                     'Open Data Commons Attribution License (ODC-By) v1.0 ' ...
                     'whose full text can be found at ' ...
                     'http://opendatacommons.org/licenses/by/1.0/. ' ...
                     'Any rights in individual contents of the dataset are ' ...
                     'licensed under the Open Data Commons Attribution License (ODC-By) v1.0 ' ...
                     'whose text can be found at http://opendatacommons.org/licenses/by/1.0/.'];
                
                % add caller-contributed ones
                inf_fields = fieldnames(info);
                for i = 1:length(inf_fields)
                    fldnm = inf_fields{i};
                    global_atts.(fldnm) = info.(fldnm);
                end
                
                addncattributes(ncid, global_atts)
            else
                dim_collocs = netcdf.inqDimID(ncid, 'Collocations');
            end
            
            %% define variables, variable attributes, additional dimensions
            
            vars = fieldnames(localcols);
            varids = zeros(size(vars));
            dims = struct();
            for j = 1:length(vars)
                varname = vars{j};
                type = self.members.(varname).type;
                if isfield(self.members.(varname), 'atts')
                    atts = self.members.(varname).atts;
                else
                    atts = struct();
                end
                
                % check if we have other dimensions besides the length
                if isfield(self.members.(varname), 'dims') && size(data, 1)>0
                    dimname = self.members.(varname).dims{1};
                    dimsize = self.members.(varname).dims{2};
                    try
                        if ~isfield(dims, dimname)
                            dims.(dimname) = netcdf.defDim(ncid, dimname, dimsize);
                        end
                    catch ME
                        switch ME.identifier
                            case {'MATLAB:netcdf:defDim:nameIsAlreadyInUse', 'MATLAB:imagesci:netcdf:libraryFailure'}
                                % no problem: dimension exists already
                                dims.(dimname) = netcdf.inqDimID(ncid, dimname);
                            otherwise
                                ME.rethrow();
                        end
                    end
                    thisdim = [dim_collocs dims.(dimname)];
                else
                    thisdim = dim_collocs;
                end
                % define variable and put attributes
                try
                    % if this doesn't fail, the variable already exists
                    varid = netcdf.inqVarID(ncid, varname);
                    
                    % while we're at it, let's check the dimensions
                    [~, ~, dids] = netcdf.inqVar(ncid, varid);
                    [~, stored_n_collocs] = netcdf.inqDim(ncid, dids(1));
                    if size(data, 1) ~= stored_n_collocs
                        errstr = sprintf(['NetCDF file contains data for variable %s ' ...
                            'with a different size than the data I''m trying to store. ' ...
                            ' Stored no. of entries is %d, whereas new size is %d entries. '], ...
                            varname, stored_n_collocs, size(data, 1));
                        % use isequal: 'overwrite' may be a cell array, in
                        % which case '==' would error
                        if isequal(self.overwrite, 2) && isa(self, 'Collapser')
                            errstr = [errstr, ...
                                sprintf([' Probably the original collapsing ' ...
                                'is from the era where entries where removed if ' ...
                                'all data were flagged, even if the collocation was ' ...
                                'otherwise valid.  When additional data is added that ' ...
                                'is not flagged, this results in a different no. of ' ...
                                'collocations.  The only solution is to set %s.overwrite ' ...
                                'to 1.  Sorry about that :('], self.name)];
                        else
                            errstr = [errstr, ...
                                sprintf([' I''m not sure why I''m even ' ...
                                'here in the first place, because I''m not trying ' ...
                                'to extend a Collapser (I''m a %s in overwrite ' ...
                                'mode %d'], class(self), self.overwrite)];
                        end
                        % pass errstr as data, not as a format string, so
                        % stray '%' characters cannot corrupt the message
                        error(['atmlab:' mfilename ':wrongsize'], '%s', errstr);
                    end
                    % -1 marks 'already exists, do not write'
                    varids(j) = -1;
                catch ME
                    switch ME.identifier
                        case 'MATLAB:imagesci:netcdf:libraryFailure'
                            varid = netcdf.defVar(ncid, varname, type, thisdim);
                            varids(j) = varid;
                            for k = fieldnames(atts)'
                                netcdf.putAtt(ncid, varid, k{1}, atts.(k{1}));
                            end
                        otherwise
                            ME.rethrow();
                    end
                end
                
            end
            
            %% write data
            
            % end define mode
            netcdf.endDef(ncid);
            
            if isempty(data)
                logtext(atmlab('OUT'), 'No data, NetCDF file will be dataless\n');
            else
                % put vars
                logtext(atmlab('OUT'), 'Writing: ');
                for j = 1:length(vars)
                    varname = vars{j};
                    varid = varids(j);
                    if varid == -1
                        % variable already existed; skip it
                        fprintf(atmlab('OUT'), '(not: %s) ', varname);
                        continue
                    end
                    fprintf(atmlab('OUT'), '%s ', varname);
                    if any(any(~isfinite(data(:, localcols.(varname)))))
                        if ~isfield(self.members.(varname).atts, 'missing_value')
                            error(['atmlab:' mfilename ':missingmissing'], ...
                                ['I found nonfinite values for field %s, ' ...
                                 'but no missing_value is defined for ' ...
                                 '%s.members.%s.atts.missing_value.  Please define.'], ...
                                 varname, self.name, varname);
                        end
                        % need to flag only the relevant channels.  Matlabs
                        % lacks views, so I have to do this in a really
                        % cumbersome way.
                        chanmask = false(size(data));
                        chanmask(:, localcols.(varname)) = true;
                        invmask = ~isfinite(data);
                        data(chanmask&invmask) = self.members.(varname).atts.missing_value;
                    end
                    netcdf.putVar(ncid, varid, data(:, localcols.(varname)));
                end
                fprintf(atmlab('OUT'), '\n');
            end
            logtext(atmlab('OUT'), 'Finalising\n');
            logtext(atmlab('OUT'), 'Gzipping to %s and removing uncompressed\n', fn);
            netcdf.close(ncid);
            gzipped_filename = gzip(temp_out, outdir);
            if ~isequal(gzipped_filename{1}, fn)
                movefile(gzipped_filename{1}, fn);
            end
            logtext(atmlab('OUT'), 'Done\n');
        end
        
        function fields = list_fields(self)
            % return valid fields in this dataset
            fields = fieldnames(self.members);
        end
        
        function S = cast_fields_back(self, M, c)
            % Cast columns from data-matrix to 'stored' types
            %
            % The collocation toolkit internally passes around data in a
            % matrix.  To be on the safe side, this uses double, because
            % all other types (except (u)int64) fit.  However, sometimes
            % this is not desirable, and this method can be used to cast
            % types back to their types, based on the type to which they
            % should be stored.
            %
            % FORMAT
            %
            %   varargout = hd.cast_fields_back(M, c,)
            %
            % IN
            %
            %   M       matrix containing data
            %   c       structure describing columns by name
            %
            % OUT
            %
            %   S       structure with data in smaller type
            
            S = struct();
            fields = fieldnames(c);
            for i = 1:length(fields)
                field = fields{i};
                newtype = type_nc2ml(self.members.(field).type);
                S.(field) = cast(M(:, c.(field)), newtype);
            end
        end
        
        function nbytes = linesize(self, localcols)
            % Calculate the stored size (in bytes) per collocation
            %
            % Sums, over all fields in localcols, the NetCDF type width
            % times the number of columns the field occupies.
            nbytes = 0;
            sizes = struct(...
                'float',    4, ...
                'double',   8, ...
                'short',    2, ...
                'int',      4, ...
                'byte',     1);
            fields = fieldnames(localcols);
            for f = vec2row(fields)
                tp = self.members.(f{1}).type;
                width = length(localcols.(f{1}));
                nbytes = nbytes + sizes.(tp) * width;
            end
        end
        
        function first = granule_first_line(varargin)
            % homemade granules have no duplicates; first valid line is 1
            first = int32(1);
        end
        
        %% getters/setters
        
        function mem = get.members(self)
            % members getter, see doc for property members
            if self.dynamic_members
                mem = self.dynamically_get_members();
            else
                mem = self.ownmembers;
            end
        end
        
        function set.members(self, val)
            % members setter; refuses to set when members are dynamic
            if self.dynamic_members
                error(['atmlab:' mfilename ':readonly'], ...
                    ['The property ''members'' for class %s is dynamically calculated. ' ...
                     'Therefore, you cannot set the members property for %s.'], ...
                     class(self), self.name);
            else
                self.ownmembers = val;
            end
        end
        
    end
    
    methods (Access = protected)
        
        %% overload parent methods
        
        function s_out = repvars(self, s, datevec, spec)
            % replace $-placeholders in s; additionally handles $VERSION
            s = repvars@SatDataset(self, s, datevec, spec);
            if isempty(self.version)
                s_out = s;
            else
                s_out = strrep_multi(s, ...
                    '$VERSION', strrep(self.version, '.', '_'));
            end
        end
        
        function matches = infofit(self, is, datevec, spec)
            % check granule info structure; additionally matches 'version'
            matches = infofit@SatDataset(self, is, datevec, spec);
            if isfield(is, 'version') && ~isempty(self.version)
                matches = matches && isequal(is.version, strrep(self.version, '.', '_'));
            elseif isempty(self.version)
                matches = true;
            else
                matches = false;
            end
        end
        
        %% implement new methods
        
        % those are for internal use, user uses
        % CollocatedDataset.read
        
        function [M, localcols, attr] = read_single_day(self, date, spec, fields)
            % Read collocation 'fields' for 'date', 'spec'.
            %
            % Low-level function, not normally called directly. To read
            % collocated data, use CollocatedDataset.read.
            %
            % Reads a single day of data.
            %
            % FORMAT
            %
            %   [M, localcols, attr] = ds.read_single_day(date, spec, fields)
            %
            % IN
            %
            %   date    datevec     datevec for which to read data
            %   spec    various     satellite(s)
            %   fields  cellstr     fields to read from data.
            %
            % Note:
            %
            %   'fields' may be 'all', in which case all fields are read.
            %   No guarantee about the order, but this information is
            %   returned via 'localcols'.  If you need consistence of order
            %   with actual collocating, consider passing
            %   fieldnames(self.cols) as fields.
            %
            % OUT
            %
            %   M       matrix      contains requested data
            %   localcols struct    describes data columns
            %   attr    struct      contains NetCDF global attributes
            %
            % See also: HomemadeDataset/read_homemade_granule
            
            fn = self.find_granule_by_datetime(date, spec);
            logtext(atmlab('OUT'), 'Gunzipping and reading %s\n', fn);
            tmp = tempname(atmlab('WORK_AREA'));
            c = onCleanup(@()delete(tmp));
            % quote filenames so paths containing spaces survive the shell
            exec_system_cmd(['gunzip -c ''' fn ''' > ''' tmp '''']); % 3x faster than ML's gunzip
            
            if ischar(fields)
                if strcmp(fields, 'all')
                    data = loadncfile(tmp);
                    attr = data.global_attributes;
                    fields = fieldnames(data);
                else
                    error(['atmlab:' mfilename ':invalid'], ...
                        ['Invalid ''fields'' argument: ' fields]);
                end
            else
                [data, attr] = loadncvar(tmp, fields);
            end
            
            % find n. of columns to allocate, keep order
            data_fields = intersect_unsorted(fields, fieldnames(self.members));
            n_columns = sum(cellfun(@(ff) size(data.(ff), 2), data_fields));
            % NOTE(review): n_rows is taken over all of 'fields', not just
            % data_fields; for fields='all' this includes e.g. the
            % global_attributes struct (which has size 1) -- presumably
            % intentional so an all-empty file yields a 1x0 (empty) M.
            n_rows = max(cellfun(@(ff) size(data.(ff), 1), fields));
            M = nan*zeros(n_rows, n_columns);
            if isempty(M)
                localcols = struct();
                return;
            end
            n = 1;
            % copy data to matrix
            for i = 1:length(data_fields)
                fld = data_fields{i};
                n_local_cols = size(data.(fld), 2);
                range_local_cols = n:(n+n_local_cols-1);
                D = data.(fld);
                M(:, range_local_cols) = D;
                % verify data validity; should be done on writing but
                % wasn't always in the past, so still needed here
                if any(isfield(self.members.(fld).atts, {'valid_range', 'valid_min', 'valid_max'}))
                    if isfield(self.members.(fld).atts, 'valid_range')
                        lo = self.members.(fld).atts.valid_range(1);
                        hi = self.members.(fld).atts.valid_range(2);
                    else
                        % fallbacks: full range of the stored type, then
                        % tighten with valid_min / valid_max if present
                        if isinteger(D)
                            getlow = @intmin;
                            gethi = @intmax;
                        elseif isfloat(D)
                            getlow = @realmin;
                            gethi = @realmax;
                        else
                            error(['atmlab:' mfilename ':unknown'], ...
                                'Non-numeric type unsupported, %s is %s', fld, class(D));
                        end
                        lo = getlow(class(data.(fld)));
                        hi = gethi(class(data.(fld)));
                        if isfield(self.members.(fld).atts, 'valid_min')
                            lo = self.members.(fld).atts.valid_min;
                        end
                        if isfield(self.members.(fld).atts, 'valid_max')
                            hi = self.members.(fld).atts.valid_max;
                        end
                    end
                    
                    % FIXME: test that lo < data < hi || data==missing
                    wrong = (D < lo | D > hi);
                    if isfield(self.members.(fld).atts, 'missing_value')
                        wrong = wrong & (D ~= self.members.(fld).atts.missing_value);
                    end
                    % flatten: 'if' on a matrix is only true when ALL
                    % elements are true, which would miss most bad data
                    if any(wrong(:))
                        error(['atmlab:' mfilename ':invalid'], ...
                            ['Encountered invalid data for %s %s at %s.  Data for field %s ' ...
                             'must be in range %g -- %g or flagged.  Found ' ...
                             'value %g instead.  Perhaps an artefact from before ' ...
                             'the proper use of flags.  Suggest to recollocate ' ...
                             'or redo AssociatedDataset.'], ...
                             class(self), self.name, datestr(datenum(date), 'yyyy-mm-dd'), ...
                             fld, lo, hi, D(find(wrong, 1)));
                    end
                end
                % set flagged to nan (only rows where the entire field is
                % flagged across all its columns)
                if isfield(self.members.(fld).atts, 'missing_value')
                    flagged = M(:, range_local_cols) == self.members.(fld).atts.missing_value;
                    M(all(flagged, 2), range_local_cols) = nan;
                end
                localcols.(fld) = range_local_cols;
                n = n + n_local_cols;
            end
            if isempty(data_fields)
                localcols = struct();
            end
        end
        
        function pos2re(self)
            % convert self.{basedir,subdir,filename} to self.re
            %
            % This method attempts to convert
            % self.{basedir,subdir,filename) to self.re at a best-effort
            % basis.
            r = [fullfile(self.basedir, self.subdir, self.filename) ...
                 '|' self.filename];
            self.re = strrep_multi(r, ...
                '$YEAR4', '(?<year>\d{4})', ...
                '$MONTH', '(?<month>\d{2})', ...
                '$DAY', '(?<day>\d{2})', ...
                '$YEAR2', '(?<year02>\d{2})', ...
                '$DOY', '(?<doy>\d{3})', ...
                '$HOUR', '(?<hour>\d{2})', ...
                '$MINUTE', '(?<minute>\d{2})', ...
                '$SAT', '(?<satname>[a-z0-9]*)', ...
                '$VERSION', '(?<version>[a-z0-9_]{3,4})');
            % However, we don't need it to locate granules.  It's only used
            % by find_info_from_granule.
            self.tryre = false;
        end
        
    end
    
    methods (Access = {?SatDataset})
        function [S, strattr] = read_homemade_granule(self, file, varargin)
            % internal usage, reader for any granule made by this toolkit
            %
            % hd.read_homemade_granule(file, [fields])
            fields = optargs(varargin, {{}});
            
            [S, strattr] = loadncvar(file, fields);
                        
            % get additional stuff
            info = self.find_info_from_granule(file);
            if isfield(info, 'doy')
                date = dayofyear_inverse(str2double(info.year), str2double(info.doy));
                date = [date.year date.month date.day];
            else
                date = [str2double(info.year) str2double(info.month) str2double(info.day)];
            end
            % NOTE 2015-03-04: Here was an erroneous bugfix between
            % 2015-01-22 and 2015-03-04.  For most homemade-datasets,
            % times are expressed in seconds since the start of the date,
            % and epoch is set accordingly.  Not so for SpareICE, which
            % failed to collocate.
            S.epoch = round(date2unixsecs(date(1), date(2), date(3)));
            
            S.path = file;
            
            if isfield(strattr, 'version')
                S.version = strattr.version;
            end
            
            if isfield(S, 'lat')
                S = MaskInvalidGeoTimedataWithNaN(S);
            end
        end
    end
    
    % static/private are used like subfunctions
    methods (Static, Access = protected)
        function cleanup(temp_out, ncid)
            % remove temporary files and close NetCDF
            logtext(atmlab('OUT'), 'Cleaning up\n');
            try
                netcdf.close(ncid);
            catch ME
                switch ME.identifier
                    case {'MATLAB:netcdf:inq:notNetcdfID', 'MATLAB:netcdf:close:notNetcdfID', ...
                          'MATLAB:netcdf:close:ebadid:notNetcdfID', 'MATLAB:imagesci:netcdf:libraryFailure'} % already closed
                        % already closed: nothing to do
                    otherwise
                        delete(temp_out);
                        ME.rethrow();
                end
            end
            delete(temp_out);
        end
    end
end