code

Using CsvHelper to read large CSV files

Robin Michael

Oct 30, 2018 • 3 min read

Working through a CSV file like this. This CSV file is produced automatically and contains hundreds of thousands of records, but the file itself is a collection of several CSV files appended together. Confusing yes, but CsvHelper can handle this at do it fast.

C,NEMP.WORLD,NEXT_DAY_DISPATCH,AEMO,PUBLIC,2018/10/04,04:10:00,0000000299705944,NEXT_DAY_DISPATCH,0000000299705943
I,DISPATCH,UNIT_SOLUTION,2,SETTLEMENTDATE,RUNNO,DUID,TRADETYPE,DISPATCHINTERVAL,INTERVENTION,CONNECTIONPOINTID,DISPATCHMODE,AGCSTATUS,INITIALMW,TOTALCLEARED,RAMPDOWNRATE,RAMPUPRATE,LOWER5MIN,LOWER60SEC,LOWER6SEC,RAISE5MIN,RAISE60SEC,RAISE6SEC,DOWNEPF,UPEPF,MARGINAL5MINVALUE,MARGINAL60SECVALUE,MARGINAL6SECVALUE,MARGINALVALUE,VIOLATION5MINDEGREE,VIOLATION60SECDEGREE,VIOLATION6SECDEGREE,VIOLATIONDEGREE,LASTCHANGED,LOWERREG,RAISEREG,AVAILABILITY,RAISE6SECFLAGS,RAISE60SECFLAGS,RAISE5MINFLAGS,RAISEREGFLAGS,LOWER6SECFLAGS,LOWER60SECFLAGS,LOWER5MINFLAGS,LOWERREGFLAGS,RAISEREGAVAILABILITY,RAISEREGENABLEMENTMAX,RAISEREGENABLEMENTMIN,LOWERREGAVAILABILITY,LOWERREGENABLEMENTMAX,LOWERREGENABLEMENTMIN,RAISE6SECACTUALAVAILABILITY,RAISE60SECACTUALAVAILABILITY,RAISE5MINACTUALAVAILABILITY,RAISEREGACTUALAVAILABILITY,LOWER6SECACTUALAVAILABILITY,LOWER60SECACTUALAVAILABILITY,LOWER5MINACTUALAVAILABILITY,LOWERREGACTUALAVAILABILITY,SEMIDISPATCHCAP
D,DISPATCH,UNIT_SOLUTION,2,"2018/10/03 04:05:00",1,AGLHAL,0,20181003001,0,SHPS1,0,0,0,0,720,720,0,0,0,0,0,0,,,,,,,,,,,"2018/10/03 04:00:07",0,0,156,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
I,DISPATCH,LOCAL_PRICE,1,SETTLEMENTDATE,DUID,LOCAL_PRICE_ADJUSTMENT,LOCALLY_CONSTRAINED
D,DISPATCH,LOCAL_PRICE,1,"2018/10/03 14:50:00",ARWF1,-179.55,2
I,DISPATCH,OFFERTRK,1,SETTLEMENTDATE,DUID,BIDTYPE,BIDSETTLEMENTDATE,BIDOFFERDATE,LASTCHANGED
D,DISPATCH,OFFERTRK,1,"2018/10/03 04:05:00",AGLHAL,ENERGY,"2018/10/03 00:00:00","2018/10/02 10:43:05","2018/10/03 04:00:07"
I,DISPATCH,CONSTRAINT,5,SETTLEMENTDATE,RUNNO,CONSTRAINTID,DISPATCHINTERVAL,INTERVENTION,RHS,MARGINALVALUE,VIOLATIONDEGREE,LASTCHANGED,DUID,GENCONID_EFFECTIVEDATE,GENCONID_VERSIONNO,LHS
D,DISPATCH,CONSTRAINT,5,"2018/10/03 04:05:00",1,#BANN1_E,20181003001,0,32,0,0,"2018/10/03 04:00:07",BANN1,"2018/10/01 00:00:00",1,0
I,DISPATCH,MNSPBIDTRK,1,SETTLEMENTDATE,RUNNO,PARTICIPANTID,LINKID,OFFERSETTLEMENTDATE,OFFEREFFECTIVEDATE,OFFERVERSIONNO,LASTCHANGED
D,DISPATCH,MNSPBIDTRK,1,"2018/10/03 04:05:00",1,BASSLINK,BLNKTAS,"2018/10/03 00:00:00","2018/09/21 00:00:00",1,"2018/10/03 04:00:13"
C,"END OF REPORT",386560

I will define a PowerRecord and the associated ClassMap for translating it when using the CsvReader provided by CsvHelper.

public class PowerRecord
{
	public DateTime SettlementDate { get; set; }
	public string DUID { get; set; }
	public double InitialMW { get; set; }
	public bool SemiDispatchCap { get; set; }
}

Here the above DTO is registered for use when reading with CsvReader that this classmap which translates the records as read from raw string data record according to the the column name that matches and produces the appropriate result DTO.

internal sealed class PowerRecordMap : ClassMap<PowerRecord>
{
	public PowerRecordMap()
	{
		Map(a => a.SettlementDate).ConvertUsing(row => PowerRecordTimeStamp.ReadTimeStampUTC(row.GetField("SettlementDate")).GetValueOrDefault());                
		Map(a => a.DUID).ConvertUsing(row => row.GetField<string>("DUID"));            
		Map(a => a.InitialMW).ConvertUsing(row => row.GetField<double>("InitialMW"));
		Map(a => a.SemiDispatchCap).ConvertUsing(row =>
		{
			var value = row.GetField<int>("SemiDispatchCap");
			return value > 0;
		});
	}
}

Then if I define some extension methods to read the untransformed PowerData records as one big string. This is not as bad as it seems since the data is less than 50 MB at max which although could be streamed out is an optimization that can be left for later. This is a prove and validate the process and then refine task.

public static string AsCSV(this PowerData input)
{
	input = input ?? new PowerData {Rows = new List<string>()};
	input.HeaderRow = (input.HeaderRow ?? "").Trim();
	if (string.IsNullOrEmpty(input.HeaderRow))
	{
		return "";
	}
	var contents = new StringBuilder();            
	contents.AppendLine(input.HeaderRow.RemoveNewLines());
	foreach (var row in input.Rows)
	{
		contents.AppendLine(row.RemoveNewLines());
	}
	return contents.ToString();
}

Create an extension on the raw data to match the records as it changes throughout the file and when it does the columns will be changed.

private static CsvReader PowerReader(this PowerData input, string headerRow)
{
	input = input ?? new PowerData {Rows = new List<string>()};
	input.Rows = input.Rows ?? new List<string>();
	if (!input.Rows.Any() || !input.IsHeaderMatch(headerRow))
	{
		return null;
	}
	var contents = input.AsCSV();
	var csv = new CsvReader(new StringReader(contents));
	csv.Configuration.PrepareHeaderForMatch = a => a.ToUpperInvariant();
	csv.Configuration.TrimOptions = TrimOptions.Trim;
	return csv;
}

Now you can read the file into the separated files and use the registered classmap for the data records you want to extract in a small, digestable, and testable snippet of code.

using (var csv = input.PowerReader(PowerDataHeaders.Load))
{
	if (csv == null)
	{
		return new List<NemDispatchLoad>();
	}
	csv.Configuration.RegisterClassMap<PowerRecordMap>();
	var records = csv.GetRecords<PowerRecord>();
	return records.ToList();
}

Sign up for more like this.