在 pandas 中按行将 0 替换为 1、将 1 替换为 0

benim 发布于 2019-11-08 pandas 最后更新 2019-11-08 22:59 18 浏览

我写了一个函数,用虚拟变量(dummy)对数据集中的分类属性进行编码。如果某一行中 1 的个数大于 0 的个数,我希望把该行中的 0 改为 1、把 1 改为 0:

def dummy_data(data, columns):
    """One-hot encode ``columns`` and invert the rows where ones outnumber zeros.

    Each column named in ``columns`` is replaced by its indicator (dummy)
    columns, appended to the right of the remaining columns. Afterwards,
    for every row whose count of cells equal to 1 is strictly greater than
    its count of cells equal to 0, every 0 in that row becomes 1 and every
    1 becomes 0. Cells holding any other value are left unchanged.

    Args:
        data: Input DataFrame. It is not modified; a new frame is returned.
        columns: Iterable of column names in ``data`` to encode.

    Returns:
        A new DataFrame with the encoded (and possibly inverted) rows.
    """
    for column in columns:
        # dtype=int keeps the indicators as 0/1 integers; recent pandas
        # versions default to bool columns, which `replace` cannot swap.
        dummies = pd.get_dummies(data[column], prefix=column, dtype=int)
        data = pd.concat([data, dummies], axis=1).drop(column, axis=1)

    n_zero = (data == 0).sum(axis=1)
    n_uno = (data == 1).sum(axis=1)
    flip = n_uno > n_zero

    # A single dict-based replace swaps both values at once. Chaining
    # `.map({0: 1})` and `.map({1: 0})` would turn every value missing
    # from the mapping into NaN instead.
    swapped = data.replace({0: 1, 1: 0})

    # Take the swapped values only on the rows selected by `flip`.
    return data.mask(flip, swapped)
# Placeholder: list the names of every categorical column to encode.
dummy_columns = ["ATTRIBUTE1", ..., "ATTRIBUTE N"]
df = dummy_data(df, dummy_columns)
但是该函数并没有按预期替换 0 和 1 的值。
已邀请:

stotam

赞同来自:

使用numpy logical_not快速查找和反转1和0的方法:

def dummy_data(data_df, dummy_columns):
    """One-hot encode ``dummy_columns`` and invert rows dominated by ones.

    The columns not listed in ``dummy_columns`` are kept as they are, in
    their original order, and placed first in the result. The indicator
    columns follow. Only the indicator columns are counted and inverted:
    a row is inverted when it holds strictly more ones than zeros.

    Args:
        data_df: Input DataFrame. It is not modified.
        dummy_columns: List of column names in ``data_df`` to encode.
            NOTE(review): assumes these are categorical/object columns;
            ``pd.get_dummies`` passes numeric columns through unencoded.

    Returns:
        A new DataFrame: untouched columns followed by 0/1 indicator columns.
    """
    # A list comprehension keeps the original column order; a set
    # difference would return the columns in arbitrary order.
    static_columns = [col for col in data_df.columns if col not in dummy_columns]
    static_df = data_df[static_columns]

    df = pd.get_dummies(data_df[dummy_columns])
    # Work on an explicit boolean copy so that the in-place inversion below
    # never writes into (or is blocked by) the DataFrame's own buffer.
    vals = df.to_numpy(dtype=bool, copy=True)

    ones_count = vals.sum(axis=1)
    zeros_count = np.logical_not(vals).sum(axis=1)
    flip = ones_count > zeros_count
    vals[flip] = np.logical_not(vals[flip])

    indicator_df = pd.DataFrame(vals.astype(int), index=df.index, columns=df.columns)
    result_df = pd.concat([static_df, indicator_df], axis=1)
    return result_df

cvitae

赞同来自:

我认为你需要:

def dummy_data(data, columns):
    """One-hot encode ``columns`` and invert the rows where ones outnumber zeros.

    The columns in ``columns`` are encoded together in one ``get_dummies``
    call and replaced by their indicator columns, which are appended to the
    right of the remaining columns. Every row whose count of cells equal to
    1 is strictly greater than its count of cells equal to 0 then has its
    0 and 1 values swapped. Other values are left unchanged.

    Args:
        data: Input DataFrame. It is not modified; a new frame is returned.
        columns: List of column names in ``data`` to encode.

    Returns:
        A new DataFrame with the encoded (and possibly inverted) rows.
    """
    # get_dummies with all columns together; dtype=int keeps 0/1 integers,
    # because bool indicator columns would not be changed by the replace below
    dummies = pd.get_dummies(data[columns], dtype=int)
    data = pd.concat([data, dummies], axis=1).drop(columns, axis=1)
    # comparison results are summed directly, no conversion to int needed
    n_zero = (data == 0).sum(axis=1)
    n_uno = (data == 1).sum(axis=1)
    # replace by condition without loop
    m = n_uno > n_zero
    data = data.mask(m, data.replace({0: 1, 1: 0}))
    return data
示例:
# Sample frame: four single-letter categorical columns, three rows.
df = pd.DataFrame({
    'A': ['a', 'b', 'b'],
    'B': ['b', 'b', 'b'],
    'C': ['b', 'a', 'a'],
    'D': ['a', 'a', 'a'],
})
print(df)
   A  B  C  D
0  a  b  b  a
1  b  b  a  a
2  b  b  a  a

def dummy_data(data, columns):
    """Walk-through version of ``dummy_data`` that prints its intermediate steps.

    Encodes ``columns`` as indicator columns, prints the encoded frame,
    prints the boolean mask of rows holding more ones than zeros, and
    returns the frame with 0 and 1 swapped on those rows. The output
    shown in the comments is what the sample frame (columns A, B, C, D
    with ``columns = ['A', 'B', 'C']``) produces.

    Args:
        data: Input DataFrame. It is not modified; a new frame is returned.
        columns: List of column names in ``data`` to encode.

    Returns:
        A new DataFrame with the encoded (and possibly inverted) rows.
    """
    # dtype=int keeps the indicators as 0/1 integers; bool indicator
    # columns would not be changed by the replace further down.
    data = pd.concat([data, pd.get_dummies(data[columns], dtype=int)], axis=1).drop(columns, axis=1)
    print (data)
    #    D  A_a  A_b  B_b  C_a  C_b
    # 0  a    1    0    1    0    1
    # 1  a    0    1    1    1    0
    # 2  a    0    1    1    1    0
    n_zero = (data == 0).sum(axis=1)
    n_uno = (data == 1).sum(axis=1)
    m = n_uno > n_zero
    print (m)
    # 0    True
    # 1    True
    # 2    True
    # dtype: bool
    data = data.mask(m, data.replace({0:1,1:0}))
    return data
# Encode columns A, B and C of the sample frame, then show the result.
dummy_columns = list('ABC')
df = dummy_data(df, dummy_columns)
print(df)
   D  A_a  A_b  B_b  C_a  C_b
0  a    0    1    0    1    0
1  a    1    0    0    0    1
2  a    1    0    0    0    1