利用DeepSeek编写使用libcsv解析csv文件并用libxlsxwriter写入xlsx文件的C程序
前面用Minicsv实现的程序不够完善,而libcsv更加完善,但调用方法比较复杂,我在网上搜到一篇liibcsv库解析csv格式文件的文章(作者:caojun97), 于是让DeepSeek将它改写为输出到xlsx,我采取了4个步骤。
第一步:
附件是使用libcsv解析csv文件的例子,把它改造成读入不固定列的csv,并输出第一行、第二行、及最后一行并根据转成数字的结果在每列后面标上类型N或T
第二步:
再添加功能,对一个大文件,先尝试读取20行,每列的最终类型由这20行决定,规则是,如果某列全是数字类型,则此列就是数字,否则是文本,如果第一行是标题行 命令行参数为-H=1,则以读取2-21行的结果确定,以上20由命令行参数 -R=20决定,规定第一个参数为文件名,如果行数不足就报错退出,仍然输出整个文件的首行、第二行和最后一行。
第三步:
数字类型包括整数和双精度浮点数,分别用I和D表示,请只给出修改此部分的代码,其他无需修改部分,用文字标出
第四步:
以刚才修改的带浮点数判断的csv读取程序为基础,把之前写xlsx的逻辑加入,数据无需每列临时判断,默认带标题行即H=1,无标题行用-H=0表示,减少取样行数为5行,带标题行的最后输出各列的标题及类型。
最终的程序如下
#include <string.h>
#include <stdlib.h>
#include <stdbool.h>
#include "csv.h"
#include "xlsxwriter.h"
#include <errno.h>#define MAX_COLS 256
#define MAX_LINE_LENGTH 65536
#define DEFAULT_SAMPLE_ROWS 5struct ColumnInfo {char *value;char type; // 'I' for integer, 'D' for double, 'T' for text
};struct CsvData {int row_count;int col_count;int sample_rows;bool has_header;bool sampling_complete;char *column_names[MAX_COLS]; // 列标题char column_types[MAX_COLS]; // 列类型lxw_workbook *workbook;lxw_worksheet *worksheet;
};// 检查字符串是否为数字并返回类型
char get_number_type(const char *str) {if (str == NULL || *str == '\0') return 'T';char *endptr;strtod(str, &endptr);if (*endptr != '\0') return 'T'; // 不是数字// 检查是否是整数if (strchr(str, '.') == NULL && strchr(str, 'e') == NULL && strchr(str, 'E') == NULL) {return 'I'; // 整数}return 'D'; // 浮点数
}// 初始化CSV数据结构
void init_csv_data(struct CsvData *csv_data, int sample_rows, bool has_header, const char *output_file) {memset(csv_data, 0, sizeof(struct CsvData));csv_data->sample_rows = sample_rows;csv_data->has_header = has_header;csv_data->workbook = workbook_new(output_file);csv_data->worksheet = workbook_add_worksheet(csv_data->workbook, NULL);
}// 释放CSV数据结构
void free_csv_data(struct CsvData *csv_data) {for (int i = 0; i < MAX_COLS; i++) {if (csv_data->column_names[i]) free(csv_data->column_names[i]);}if (csv_data->workbook) workbook_close(csv_data->workbook);
}// 列级数据处理回调函数
void col_callback(void *s, size_t len, void *data) {struct CsvData *csv_data = (struct CsvData *)data;char *str = malloc(len + 1);memset(str, 0, len + 1);memcpy(str, (char *)s, len);if (csv_data->col_count >= MAX_COLS) {free(str);return;}char num_type = get_number_type(str);// 处理标题行if (csv_data->row_count == 0 && csv_data->has_header) {csv_data->column_names[csv_data->col_count] = strdup(str);free(str);csv_data->col_count++;return;}// 采样阶段:确定列类型if (!csv_data->sampling_complete) {int effective_row = csv_data->has_header ? csv_data->row_count - 1 : csv_data->row_count;if (effective_row < csv_data->sample_rows) {// 初始化列类型if (effective_row == 0) {csv_data->column_types[csv_data->col_count] = num_type;} else {// 更新列类型(浮点数覆盖整数)if (csv_data->column_types[csv_data->col_count] == 'I' && num_type == 'D') {csv_data->column_types[csv_data->col_count] = 'D';} else if (num_type == 'T') {csv_data->column_types[csv_data->col_count] = 'T';}}// 如果是采样最后一行,标记完成if (effective_row == csv_data->sample_rows - 1) {csv_data->sampling_complete = (csv_data->col_count == MAX_COLS - 1);}}}// 写入Excelint excel_row = csv_data->has_header ? csv_data->row_count - 1 : csv_data->row_count;switch (csv_data->column_types[csv_data->col_count]) {case 'I':worksheet_write_number(csv_data->worksheet, excel_row, csv_data->col_count, atoi(str), NULL);break;case 'D':worksheet_write_number(csv_data->worksheet, excel_row, csv_data->col_count, atof(str), NULL);break;default:worksheet_write_string(csv_data->worksheet, excel_row, csv_data->col_count, str, NULL);}free(str);csv_data->col_count++;
}// 行级数据处理回调函数
void row_callback(int c, void *data) {struct CsvData *csv_data = (struct CsvData *)data;// 检查是否采样了足够的行int sampled_rows = csv_data->has_header ? (csv_data->row_count > 0 ? csv_data->row_count - 1 : 0) : csv_data->row_count;if (!csv_data->sampling_complete && sampled_rows >= csv_data->sample_rows) {csv_data->sampling_complete = true;}// 准备处理下一行csv_data->row_count++;csv_data->col_count = 0;
}int csv_parse_file(const char *file_path, struct CsvData *csv_data) {struct csv_parser conf;FILE *fp = NULL;size_t bytes_read = 0;size_t retval = 0;char buf[1024] = { 0 };int result = -1;if (csv_init(&conf, CSV_STRICT | CSV_STRICT_FINI) != 0) {printf("failed to initialize csv parser\n");return result;}fp = fopen(file_path, "rb");if (fp == NULL) {fprintf(stderr, "Failed to open file %s: %s\n", file_path, strerror(errno));goto END;}while ((bytes_read = fread(buf, sizeof(char), sizeof(buf), fp)) > 0) {retval = csv_parse(&conf, buf, bytes_read, col_callback, row_callback, csv_data);if (retval != bytes_read) {fprintf(stderr, "Error parsing file: %s\n", csv_strerror(csv_error(&conf)));goto END;}}if (0 != csv_fini(&conf, col_callback, row_callback, csv_data)) {goto END;}// 检查是否采样了足够的行int sampled_rows = csv_data->has_header ? (csv_data->row_count > 1 ? csv_data->row_count - 1 : 0) : csv_data->row_count;if (sampled_rows < csv_data->sample_rows) {fprintf(stderr, "Error: File has only %d rows (need at least %d rows for sampling)\n",csv_data->row_count, csv_data->has_header ? csv_data->sample_rows + 1 : csv_data->sample_rows);result = -2;goto END;}result = 0;END:csv_free(&conf);if (fp) fclose(fp);return result;
}void parse_args(int argc, char *argv[], int *sample_rows, bool *has_header, const char **input_file, const char **output_file) {*sample_rows = DEFAULT_SAMPLE_ROWS;*has_header = true; // 默认带标题行*input_file = NULL;*output_file = "output.xlsx";for (int i = 1; i < argc; i++) {if (strncmp(argv[i], "-H=", 3) == 0) {*has_header = atoi(argv[i] + 3) != 0;} else if (strncmp(argv[i], "-R=", 3) == 0) {*sample_rows = atoi(argv[i] + 3);if (*sample_rows <= 0) {fprintf(stderr, "Sample rows must be positive, using default %d\n", DEFAULT_SAMPLE_ROWS);*sample_rows = DEFAULT_SAMPLE_ROWS;}} else if (*input_file == NULL) {*input_file = argv[i];} else if (*output_file == NULL) {*output_file = argv[i];} else {fprintf(stderr, "Ignoring unknown argument: %s\n", argv[i]);}}if (*input_file == NULL) {fprintf(stderr, "Usage: %s <input.csv> [output.xlsx] [-H=0|1] [-R=sample_rows]\n", argv[0]);exit(EXIT_FAILURE);}
}int main(int argc, char *argv[]) {int sample_rows;bool has_header;const char *input_file;const char *output_file;parse_args(argc, argv, &sample_rows, &has_header, &input_file, &output_file);struct CsvData csv_data;init_csv_data(&csv_data, sample_rows, has_header, output_file);printf("Processing CSV file: %s\n", input_file);printf("Output XLSX file: %s\n", output_file);printf("Sample rows: %d\n", sample_rows);printf("Header row: %s\n", has_header ? "yes" : "no");if (csv_parse_file(input_file, &csv_data)!=0) {free_csv_data(&csv_data);return EXIT_FAILURE;}// 输出列信息if (has_header) {printf("\nColumn names and types:\n");for (int i = 0; i < MAX_COLS && csv_data.column_names[i]; i++) {printf("Column %d: %s (%c)\n", i+1, csv_data.column_names[i], csv_data.column_types[i]);}}printf("\nSuccessfully converted %d rows to XLSX\n", has_header ? csv_data.row_count - 1 : csv_data.row_count);free_csv_data(&csv_data);return EXIT_SUCCESS;
}
编译执行, 先将网上下载的libcsv源码中的csv.h和libcsv.c文件放在当前目录下
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:libxlsxwriter/libgcc libcsv.c ds_libcsv2xlsx.c -o ds_libcsv2xlsx -I . -O3 -I libxlsxwriter/include -lxlsxwriter -L libxlsxwriter/libtime ./ds_libcsv2xlsx NYC_311_SR_2010-2020-sample-1M.csv dsnyc.xlsx
Ignoring unknown argument: dsnyc.xlsx
Processing CSV file: NYC_311_SR_2010-2020-sample-1M.csv
Output XLSX file: output.xlsx
Sample rows: 5
Header row: yesColumn names and types:
Column 1: Unique Key (I)
Column 2: Created Date (T)
Column 3: Closed Date (T)
Column 4: Agency (T)
Column 5: Agency Name (T)
Column 6: Complaint Type (T)
Column 7: Descriptor (T)
Column 8: Location Type (T)
Column 9: Incident Zip (I)
Column 10: Incident Address (T)
Column 11: Street Name (T)
Column 12: Cross Street 1 (T)
Column 13: Cross Street 2 (T)
Column 14: Intersection Street 1 (T)
Column 15: Intersection Street 2 (T)
Column 16: Address Type (T)
Column 17: City (T)
Column 18: Landmark (T)
Column 19: Facility Type (T)
Column 20: Status (T)
Column 21: Due Date (T)
Column 22: Resolution Description (T)
Column 23: Resolution Action Updated Date (T)
Column 24: Community Board (T)
Column 25: BBL (T)
Column 26: Borough (T)
Column 27: X Coordinate (State Plane) (I)
Column 28: Y Coordinate (State Plane) (I)
Column 29: Open Data Channel Type (T)
Column 30: Park Facility Name (T)
Column 31: Park Borough (T)
Column 32: Vehicle Type (T)
Column 33: Taxi Company Borough (T)
Column 34: Taxi Pick Up Location (T)
Column 35: Bridge Highway Name (T)
Column 36: Bridge Highway Direction (T)
Column 37: Road Ramp (T)
Column 38: Bridge Highway Segment (T)
Column 39: Latitude (D)
Column 40: Longitude (D)
Column 41: Location (T)Successfully converted 1000000 rows to XLSXreal 1m6.379s
user 0m42.476s
sys 0m6.464s
测试生成的xlsx文件能被catamine 示例文件读取
time /par/calamine/target/release/examples/excel_to_csv /par/output.xlsx Sheet1real 0m29.629s
user 0m16.662s
sys 0m2.356s