VersionBay decided to revisit the earlier post: Lightning Fast – Parsing Multiple CSV Files as there was an interesting note in the release notes of R2019a:
Import Tool now functions consistently across different platforms and generates code that is easy to read for importing text files. For more information, see Import Text File Data Using Import Tool.
MATLAB Release Notes R2019a
Here is the newly generated function, that is cleaner and much faster compared to what was generated with R2018b.
R2019a
function FLinsurancesample = importfileR2019a(filename, dataLines) %importfileR2019a Import data from a text file % FLINSURANCESAMPLE = importfileR2019a(FILENAME) reads data from text file % FILENAME for the default selection. Returns the data as a table. % % FLINSURANCESAMPLE = importfileR2019a(FILE, DATALINES) reads data for the % specified row interval(s) of text file FILENAME. Specify DATALINES as % a positive scalar integer or a N-by-2 array of positive scalar % integers for dis-contiguous row intervals. % % Example: % FLinsurancesample = importfileR2019a("FL_insurance_sample.csv", [2, Inf]); % % See also READTABLE. % % Auto-generated by MATLAB on 29-Mar-2019 16:01:06 %% Input handling % If dataLines is not specified, define defaults if nargin < 2 dataLines = [2, Inf]; end %% Setup the Import Options opts = delimitedTextImportOptions("NumVariables", 18); % Specify range and delimiter opts.DataLines = dataLines; opts.Delimiter = ","; % Specify column names and types opts.VariableNames = ["policyID", "statecode", "county", "eq_site_limit", "hu_site_limit", "fl_site_limit", "fr_site_limit", "tiv_2011", "tiv_2012", "eq_site_deductible", "hu_site_deductible", "fl_site_deductible", "fr_site_deductible", "point_latitude", "point_longitude", "line", "construction", "point_granularity"]; opts.VariableTypes = ["double", "categorical", "categorical", "double", "double", "double", "double", "double", "double", "double", "double", "double", "double", "double", "double", "categorical", "categorical", "double"]; opts = setvaropts(opts, [2, 3, 16, 17], "EmptyFieldRule", "auto"); opts.ExtraColumnsRule = "ignore"; opts.EmptyLineRule = "read"; % Import the data FLinsurancesample = readtable(filename, opts); end
R2018b
function FLinsurancesample = importfile(filename, startRow, endRow) %IMPORTFILE Import numeric data from a text file as a matrix. % FLINSURANCESAMPLE = IMPORTFILE(FILENAME) Reads data from text file % FILENAME for the default selection. % % FLINSURANCESAMPLE = IMPORTFILE(FILENAME, STARTROW, ENDROW) Reads data % from rows STARTROW through ENDROW of text file FILENAME. % % Example: % FLinsurancesample = importfile('FL_insurance_sample.csv', 2, 36635); % % See also TEXTSCAN. % Auto-generated by MATLAB on 2018/12/14 16:58:46 %% Initialize variables. delimiter = ','; if nargin<=2 startRow = 2; endRow = inf; end %% Read columns of data as text: % For more information, see the TEXTSCAN documentation. formatSpec = '%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%[^\n\r]'; %% Open the text file. fileID = fopen(filename,'r'); %% Read columns of data according to the format. % This call is based on the structure of the file used to generate this % code. If an error occurs for a different file, try regenerating the code % from the Import Tool. dataArray = textscan(fileID, formatSpec, endRow(1)-startRow(1)+1, 'Delimiter', delimiter, 'TextType', 'string', 'HeaderLines', startRow(1)-1, 'ReturnOnError', false, 'EndOfLine', '\r\n'); for block=2:length(startRow) frewind(fileID); dataArrayBlock = textscan(fileID, formatSpec, endRow(block)-startRow(block)+1, 'Delimiter', delimiter, 'TextType', 'string', 'HeaderLines', startRow(block)-1, 'ReturnOnError', false, 'EndOfLine', '\r\n'); for col=1:length(dataArray) dataArray{col} = [dataArray{col};dataArrayBlock{col}]; end end %% Close the text file. fclose(fileID); %% Convert the contents of columns containing numeric text to numbers. % Replace non-numeric text with NaN. raw = repmat({''},length(dataArray{1}),length(dataArray)-1); for col=1:length(dataArray)-1 raw(1:length(dataArray{col}),col) = mat2cell(dataArray{col}, ones(length(dataArray{col}), 1)); end numericData = NaN(size(dataArray{1},1),size(dataArray,2)); for col=[1,4,5,6,7,8,9,10,11,12,13,14,15,18] % Converts text in the input cell array to numbers. Replaced non-numeric % text with NaN. rawData = dataArray{col}; for row=1:size(rawData, 1) % Create a regular expression to detect and remove non-numeric prefixes and % suffixes. regexstr = '(?<prefix>.*?)(?<numbers>([-]*(\d+[\,]*)+[\.]{0,1}\d*[eEdD]{0,1}[-+]*\d*[i]{0,1})|([-]*(\d+[\,]*)*[\.]{1,1}\d+[eEdD]{0,1}[-+]*\d*[i]{0,1}))(?<suffix>.*)'; try result = regexp(rawData(row), regexstr, 'names'); numbers = result.numbers; % Detected commas in non-thousand locations. invalidThousandsSeparator = false; if numbers.contains(',') thousandsRegExp = '^\d+?(\,\d{3})*\.{0,1}\d*$'; if isempty(regexp(numbers, thousandsRegExp, 'once')) numbers = NaN; invalidThousandsSeparator = true; end end % Convert numeric text to numbers. if ~invalidThousandsSeparator numbers = textscan(char(strrep(numbers, ',', '')), '%f'); numericData(row, col) = numbers{1}; raw{row, col} = numbers{1}; end catch raw{row, col} = rawData{row}; end end end %% Split data into numeric and string columns. rawNumericColumns = raw(:, [1,4,5,6,7,8,9,10,11,12,13,14,15,18]); rawStringColumns = string(raw(:, [2,3,16,17])); %% Replace non-numeric cells with NaN R = cellfun(@(x) ~isnumeric(x) && ~islogical(x),rawNumericColumns); % Find non-numeric cells rawNumericColumns(R) = {NaN}; % Replace non-numeric cells %% Make sure any text containing <undefined> is properly converted to an <undefined> categorical for catIdx = [1,2,3,4] idx = (rawStringColumns(:, catIdx) == "<undefined>"); rawStringColumns(idx, catIdx) = ""; end %% Create output variable FLinsurancesample = table; FLinsurancesample.policyID = cell2mat(rawNumericColumns(:, 1)); FLinsurancesample.statecode = categorical(rawStringColumns(:, 1)); FLinsurancesample.county = categorical(rawStringColumns(:, 2)); FLinsurancesample.eq_site_limit = cell2mat(rawNumericColumns(:, 2)); FLinsurancesample.hu_site_limit = cell2mat(rawNumericColumns(:, 3)); FLinsurancesample.fl_site_limit = cell2mat(rawNumericColumns(:, 4)); FLinsurancesample.fr_site_limit = cell2mat(rawNumericColumns(:, 5)); FLinsurancesample.tiv_2011 = cell2mat(rawNumericColumns(:, 6)); FLinsurancesample.tiv_2012 = cell2mat(rawNumericColumns(:, 7)); FLinsurancesample.eq_site_deductible = cell2mat(rawNumericColumns(:, 8)); FLinsurancesample.hu_site_deductible = cell2mat(rawNumericColumns(:, 9)); FLinsurancesample.fl_site_deductible = cell2mat(rawNumericColumns(:, 10)); FLinsurancesample.fr_site_deductible = cell2mat(rawNumericColumns(:, 11)); FLinsurancesample.point_latitude = cell2mat(rawNumericColumns(:, 12)); FLinsurancesample.point_longitude = cell2mat(rawNumericColumns(:, 13)); FLinsurancesample.line = categorical(rawStringColumns(:, 3)); FLinsurancesample.construction = categorical(rawStringColumns(:, 4)); FLinsurancesample.point_granularity = cell2mat(rawNumericColumns(:, 14));