前一篇文章说了如何用c语言读取CSV文件并保存为二维数组
,这一篇文章将会在将对得到二维数组根据某一个值进行划分操作
CSV文件
x1,x2,x3,x4,y
3.6216,8.6661,-2.8073,-0.44699,0
4.5459,8.1674,-2.4586,-1.4621,0
3.866,-2.6383,1.9242,0.10645,0
3.4566,9.5228,-4.0112,-3.5944,1
0.32924,-4.4552,4.5718,-0.9888,1
4.3684,9.6718,-3.9606,-3.1625,1
以 x2=8.1674" role="presentation" >
x2=8.1674
为界限,把二维数组划分为两部分,对于二维数组中的每一行来说x2" role="presentation" >
x2
的值大于8.1674" role="presentation" >
8.1674
的存储在一个二维数组里面,小于8.1674" role="presentation" >
8.1674
的每一行存储在一个二维数组里面
思路很简单,就是利用循环来进行划分操作
先来一个Python版本的伪代码,比较直观:
def split(index, value, dataset):
left, right = list(), list()
for row in dataset:
if row[index] else:
right.append(row)
return left, right
c语言里面没有append()" role="presentation" >
append()
操作,只能利用双重循环来进行赋值,并且我们要利用两个链表来记录所在行x2" role="presentation" >
x2
的值小于和大于8.1674" role="presentation" >
8.1674
值所在行的行下标
完整的c语言代码
#include
#include
#include
struct node{
int data;
struct node *next;
};
void get_two_dimension(char* line, double** data, char *filename);
void print_two_dimension(double** data, int row, int col);
int get_row(char *filename);
int get_col(char *filename);
int Gini(int** train_data, int* target);
void split_data(double **data, int index, int value, int row, int col);
int main()
{
char filename[] = "C:\\Users\\...\\...\\csvtest.csv";
char line[1024];
double **data;
int row, col;
int index = 1;
double value = 8.1674;
row = get_row(filename);
col = get_col(filename);
data = (double **)malloc(row * sizeof(int *));
for (int i = 0; i double *)malloc(col * sizeof(double));
}
get_two_dimension(line, data, filename);
printf("row = %d\n", row);
printf("col = %d\n", col);
split_data(data, index, value, row, col);
}
void get_two_dimension(char* line, double** data, char *filename)
{
FILE* stream = fopen(filename, "r");
int i = 0;
while (fgets(line, 1024, stream))
{
int j = 0;
char *tok;
char* tmp = strdup(line);
for (tok = strtok(line, ","); tok && *tok; j++, tok = strtok(NULL, ",\n")){
data[i][j] = atof(tok);
}
i++;
free(tmp);
}
fclose(stream);
}
void print_two_dimension(double** data, int row, int col)
{
int i, j;
for(i=1; ifor(j=0; jprintf("%f\t", data[i][j]);
}
printf("\n");
}
}
int get_row(char *filename)
{
char line[1024];
int i;
FILE* stream = fopen(filename, "r");
while(fgets(line, 1024, stream)){
i++;
}
fclose(stream);
return i;
}
int get_col(char *filename)
{
char line[1024];
int i = 0;
FILE* stream = fopen(filename, "r");
fgets(line, 1024, stream);
char* token = strtok(line, ",");
while(token){
token = strtok(NULL, ",");
i++;
}
fclose(stream);
return i;
}
void split_data(double **data, int index, int value, int row, int col)
{
node *left_node, *right_node;
left_node = (node *)malloc(sizeof(node));
right_node = (node *)malloc(sizeof(node));
left_node->next = NULL;
right_node->next = NULL;
int count_left_size=0, i, count_right_size=0, j, k;
for(i=1; iif(data[i][index] else
count_right_size++;
}
printf("left size = %d\n", count_left_size);
printf("right size = %d\n", count_right_size);
double left_array[count_left_size][col], right_array[count_right_size][col];
for(i=1; iif(data[i][index] malloc(sizeof(node));
p->data = i;
p->next = left_node->next;
left_node->next = p;
}
else{
node *q;
q = (node *)malloc(sizeof(node));
q->data = i;
q->next = right_node->next;
right_node->next = q;
}
}
left_node = left_node->next;
right_node = right_node->next;
i = 0;
while(left_node){
for(j=0; jdata][j];
}
left_node = left_node->next;
i++;
}
i = 0;
while(right_node){
for(j=0; jdata][j];
}
right_node = right_node->next;
i++;
}
for(i=0; iprintf("I'm left\n");
for(j=0; jprintf("%f\t", left_array[i][j]);
}
printf("\n");
}
for(i=0; iprintf("I'm right\n");
for(j=0; jprintf("%f\t", right_array[i][j]);
}
printf("\n");
}
}
运行结果
row = 7
col = 5
left size = 2
right size = 4
I
0.329240 -4.455200 4.571800 -0.988800 1.000000
I
3.866000 -2.638300 1.924200 0.106450 0.000000
I
4.368400 9.671800 -3.960600 -3.162500 1.000000
I
3.456600 9.522800 -4.011200 -3.594400 1.000000
I
4.545900 8.167400 -2.458600 -1.462100 0.000000
I
3.621600 8.666100 -2.807300 -0.446990 0.000000
可以直观的感觉到Python代码是多么的简洁,不过在用c语言实现时用到了单链表,终于感觉单链表没白学了。