# 1. Feature Engineering

```python
# Core code examples

# Statistical features: one aggregate of column `fea` per `by` group
# Mean
gp = train.groupby(by)[fea].mean()
# Median
gp = train.groupby(by)[fea].median()
# Standard deviation (use .var() if the variance itself is wanted)
gp = train.groupby(by)[fea].std()
# Maximum
gp = train.groupby(by)[fea].max()
# Minimum
gp = train.groupby(by)[fea].min()
# Number of rows in each group
gp = train.groupby(by)[fea].size()

# Several group-by statistics in one pass: mean and std
# Mean and standard deviation of the area within each community.
# Named aggregation is used because the dict-renaming form
# .agg({'new_name': 'func'}) is rejected by pandas >= 1.0
# ("nested renamer is not supported").
temp = data.groupby('communityName')['area'].agg(com_area_mean='mean', com_area_std='std')

# Feature splitting
# Change houseType to 'Room', 'Hall', 'Bath'
def Room(x):
    """Return the integer that precedes the 'room' marker in a houseType string.

    Raises ValueError if that prefix is not an integer.
    """
    # NOTE(review): 'room' looks like a machine-translated stand-in for the
    # original unit marker -- confirm against the real houseType values.
    return int(x.split('room')[0])
def Hall(x):
    """Return the integer between the 'room' and 'office' markers of a houseType string.

    Raises IndexError if 'room' is missing and ValueError if the field is not
    an integer.
    """
    # NOTE(review): 'room'/'office' look like machine-translated stand-ins for
    # the original unit markers -- confirm against the real houseType values.
    after_room = x.split("room")[1]
    return int(after_room.split("office")[0])
def Bath(x):
    """Return the integer between the 'office' and 'Wei' markers of a houseType string.

    Raises IndexError if 'room' or 'office' is missing and ValueError if the
    field is not an integer.
    """
    # NOTE(review): 'room'/'office'/'Wei' look like machine-translated
    # stand-ins for the original unit markers -- confirm against the real
    # houseType values.
    after_office = x.split("room")[1].split("office")[1]
    return int(after_office.split("Wei")[0])

data['Room'] = data['houseType'].apply(Room)
data['Hall'] = data['houseType'].apply(Hall)
data['Bath'] = data['houseType'].apply(Bath)

# Feature merging
# Combined transport score: each count is divided by its column mean so the
# two are comparable, and subway stations are weighted 5x.
data['trainsportNum'] = (
    5 * data['subwayStationNum'] / data['subwayStationNum'].mean()
    + data['busStationNum'] / data['busStationNum'].mean()
)

# Cross features: combine two features with + - * /
data['Room_Bath'] = (data['Bath'] + 1) / (data['Room'] + 1)

# Clustering feature
# Use GaussianMixture to build a cluster-label feature
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=4, covariance_type='full', random_state=0)
cluster_labels = gmm.fit_predict(data)

# Feature encoding
from sklearn.preprocessing import LabelEncoder
data['communityName'] = LabelEncoder().fit_transform(data['communityName'])
from sklearn.preprocessing import OneHotEncoder
# OneHotEncoder expects 2-D input and returns one column per category, so the
# result is kept in its own variable instead of being written back into a
# single column.
community_onehot = OneHotEncoder().fit_transform(data[['communityName']])

# Log-smooth features with a very wide value range (helps linear models)
import numpy as np
data[feature] = np.log1p(data[feature])
```
```python
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder

```

## 1.1 combination of features

```def newfeature(data):

# Change houseType to 'Room', 'Hall', 'Bath'
def Room(x):
Room = int(x.split('room')[0])
return Room
def Hall(x):
Hall = int(x.split("room")[1].split("office")[0])
return Hall
def Bath(x):
Bath = int(x.split("room")[1].split("office")[1].split("Wei")[0])
return Bath

data['Room'] = data['houseType'].apply(lambda x: Room(x))
data['Hall'] = data['houseType'].apply(lambda x: Hall(x))
data['Bath'] = data['houseType'].apply(lambda x: Bath(x))
data['Room_Bath'] = (data['Bath']+1) / (data['Room']+1)
# Fill in rental type
data.loc[(data['rentType'] == 'Unknown way') & (data['Room'] <= 1), 'rentType'] = 'Whole rent'
# print(data.loc[(data['rentType '] = =' unknown method ') & (data ['room_bath'] > 1),'renttype '])
data.loc[(data['rentType'] == 'Unknown way') & (data['Room_Bath'] > 1), 'rentType'] = 'Rent sharing'
data.loc[(data['rentType'] == 'Unknown way') & (data['Room'] > 1) & (data['area'] < 50), 'rentType'] = 'Rent sharing'
data.loc[(data['rentType'] == 'Unknown way') & (data['area'] / data['Room'] < 20), 'rentType'] = 'Rent sharing'
# data.loc[(data['rentType '] =' unknown method ') & (data ['area'] > 60),'renttype '] =' joint lease '
data.loc[(data['rentType'] == 'Unknown way') & (data['area'] <= 50) & (data['Room'] == 2), 'rentType'] = 'Rent sharing'
data.loc[(data['rentType'] == 'Unknown way') & (data['area'] > 60) & (data['Room'] == 2), 'rentType'] = 'Whole rent'
data.loc[(data['rentType'] == 'Unknown way') & (data['area'] <= 60) & (data['Room'] == 3), 'rentType'] = 'Rent sharing'
data.loc[(data['rentType'] == 'Unknown way') & (data['area'] > 60) & (data['Room'] == 3), 'rentType'] = 'Whole rent'
data.loc[(data['rentType'] == 'Unknown way') & (data['area'] >= 100) & (data['Room'] > 3), 'rentType'] = 'Whole rent'

# data.drop('Room_Bath', axis=1, inplace=True)
def month(x):
month = int(x.split('/')[1])
return month
# def day(x):
#     day = int(x.split('/')[2])
#     return day
# Result variation

# Split transaction time
# data['day'] = data['tradeTime'].apply(lambda x: day(x))# Result variation
#     data['pv/uv'] = data['pv'] / data['uv']
#     Data ['total number of rooms'] = data [' room '] + data [' Hall '] + data [' bathroom ']

# Characteristics of combined supporting facilities
data['trainsportNum'] = 5 * data['subwayStationNum'] / data['subwayStationNum'].mean() + data['busStationNum'] / \
data[
'busStationNum'].mean()
data['all_SchoolNum'] = 2 * data['interSchoolNum'] / data['interSchoolNum'].mean() + data['schoolNum'] / data[
'schoolNum'].mean() \
+ data['privateSchoolNum'] / data['privateSchoolNum'].mean()
data['all_hospitalNum'] = 2 * data['hospitalNum'] / data['hospitalNum'].mean() + \
data['drugStoreNum'] / data['drugStoreNum'].mean()
data['all_mall'] = data['mallNum'] / data['mallNum'].mean() + \
data['superMarketNum'] / data['superMarketNum'].mean()
data['otherNum'] = data['gymNum'] / data['gymNum'].mean() + data['bankNum'] / data['bankNum'].mean() + \
data['shopNum'] / data['shopNum'].mean() + 2 * data['parkNum'] / data['parkNum'].mean()

data.drop(['subwayStationNum', 'busStationNum',
'interSchoolNum', 'schoolNum', 'privateSchoolNum',
'hospitalNum', 'drugStoreNum', 'mallNum', 'superMarketNum', 'gymNum', 'bankNum', 'shopNum', 'parkNum'],
axis=1, inplace=True)

#     data['houseType_1sumcsu']=data['Bath'].map(lambda x:str(x))+data['month'].map(lambda x:str(x))
#     data['houseType_2sumcsu']=data['Bath'].map(lambda x:str(x))+data['communityName']
#     data['houseType_3sumcsu']=data['Bath'].map(lambda x:str(x))+data['plate']

data.drop('houseType', axis=1, inplace=True)

data["area"] = data["area"].astype(int)

# categorical_feats = ['rentType', 'houseFloor', 'houseToward', 'houseDecoration', 'communityName','region', 'plate']
categorical_feats = ['rentType', 'houseFloor', 'houseToward', 'houseDecoration',  'region', 'plate','cluster']

return data, categorical_feats
```

## 1.2 Computing count statistics

```python
# Count-based statistical features
def featureCount(train, test):
    """Add group-size ("count") features computed over train and test together.

    For each key set a ``count_<key1>[_<key2>...]`` column is added holding the
    number of rows, across both frames, that share the row's key values.

    Args:
        train: training DataFrame; not modified.
        test: test DataFrame with the same key columns; not modified.

    Returns:
        Tuple ``(new_train, new_test)`` with the count columns appended.
    """
    def feature_count(data, features=None):
        # None instead of a mutable [] default, which would be shared between calls.
        features = list(features or [])
        new_feature = '_'.join(['count'] + features)
        sizes = data.groupby(features).size().reset_index(name=new_feature)
        return data.merge(sizes, how='left', on=features)

    # Tag the rows on copies so the caller's frames are left untouched and the
    # two parts can be separated again after the joint counting.
    data = pd.concat([train.assign(data_type=0), test.assign(data_type=1)],
                     axis=0, join='outer')

    key_sets = (
        ['communityName'],
        ['buildYear'],
        ['totalFloor'],
        ['communityName', 'totalFloor'],
        ['communityName', 'newWorkers'],
    )
    for keys in key_sets:
        data = feature_count(data, keys)

    # drop() returns new frames, avoiding in-place edits on slices of `data`.
    new_train = data[data['data_type'] == 0].drop(columns='data_type')
    new_test = data[data['data_type'] == 1].drop(columns='data_type')
    return new_train, new_test

# Replace both splits with the versions returned by featureCount.
train, test = featureCount(train, test)
```

## 1.3 Statistical features generated with groupby

```python
# Group-by statistical features: mean, std, etc.

def gourpby(train, test):
    """Add group-level statistics computed over train and test together.

    The two frames are stacked, the categorical columns are label-encoded, and
    per-community / per-plate aggregates are merged back onto every row before
    the frames are split apart again.

    Args:
        train: training DataFrame; not modified.
        test: test DataFrame with the same columns; not modified.

    Returns:
        Tuple ``(new_train, new_test)`` carrying the added feature columns,
        with the categorical columns replaced by integer codes.
    """
    def add_mean_std(frame, key, value, prefix, fill=True):
        # Merge <prefix>_mean / <prefix>_std of `value` per `key` group onto frame.
        # Named aggregation is used because the dict-renaming form of .agg() is
        # rejected by pandas >= 1.0.
        stats = frame.groupby(key)[value].agg(
            **{prefix + '_mean': 'mean', prefix + '_std': 'std'})
        if fill:
            # std is NaN for groups with a single row.
            stats = stats.fillna(0)
        return frame.merge(stats, on=key, how='left')

    def add_group_sum(frame, keys, value, name):
        # Merge the per-group sum of `value` onto frame as column `name`.
        sums = frame.groupby(keys)[value].agg('sum').reset_index(name=name)
        return frame.merge(sums, on=keys, how='left')

    def add_community_share(frame, value, name):
        # Community total of `value` as a fraction of its plate total
        # (-1 where the plate total is 0).
        plate_col, com_col = 'plate_' + value, 'com_' + value
        frame = add_group_sum(frame, 'plate', value, plate_col)
        frame = add_group_sum(frame, ['communityName', 'plate'], value, com_col)
        frame[name] = [round(c / p, 3) if p != 0 else -1
                       for c, p in zip(frame[com_col], frame[plate_col])]
        return frame.drop([com_col, plate_col], axis=1)

    # Tag the rows on copies so the caller's frames are left untouched.
    data = pd.concat([train.assign(data_type=0), test.assign(data_type=1)],
                     axis=0, join='outer')

    columns = ['rentType', 'houseFloor', 'houseToward', 'houseDecoration',
               'communityName', 'region', 'plate']
    for feature in columns:
        data[feature] = LabelEncoder().fit_transform(data[feature])

    data = add_mean_std(data, 'communityName', 'area', 'com_area')

    # price_per_area is a temporary column used only for the two aggregates below.
    data['price_per_area'] = data.tradeMeanPrice / data.area * 100
    data = add_mean_std(data, 'communityName', 'price_per_area', 'comm_price')
    data = add_mean_std(data, 'plate', 'price_per_area', 'plate_price')
    data.drop('price_per_area', axis=1, inplace=True)

    data = add_mean_std(data, 'plate', 'area', 'plate_area')

    # Build year relative to the (integer-truncated) plate average; the plate
    # std is kept as a feature and, as before, not NaN-filled.
    data = add_mean_std(data, 'plate', 'buildYear', 'plate_year', fill=False)
    data.plate_year_mean = data.plate_year_mean.astype('int')
    data['comm_plate_year_diff'] = data.buildYear - data.plate_year_mean
    data.drop('plate_year_mean', axis=1, inplace=True)

    data = add_community_share(data, 'trainsportNum', 'trainsportNum_ratio')

    # NOTE(review): the original also merged the plate/community sums of
    # all_SchoolNum and dropped them again without using them, so that dead
    # computation is omitted; a school ratio feature may have been intended.

    data = add_group_sum(data, ['communityName', 'plate'], 'all_mall', 'com_all_mall')

    data = add_community_share(data, 'otherNum', 'other_ratio')

    # Row counts per month and community / plate.
    temp = data.groupby(['month', 'communityName']).size().reset_index(name='communityName_saleNum')
    data = data.merge(temp, on=['month', 'communityName'], how='left')
    temp = data.groupby(['month', 'plate']).size().reset_index(name='plate_saleNum')
    data = data.merge(temp, on=['month', 'plate'], how='left')

    # +1 on both sides keeps the ratio finite and damps tiny groups.
    data['sale_ratio'] = round((data.communityName_saleNum + 1) / (data.plate_saleNum + 1), 3)
    data['sale_newworker_differ'] = 3 * data.plate_saleNum - data.newWorkers
    data.drop(['communityName_saleNum', 'plate_saleNum'], axis=1, inplace=True)

    # drop() returns new frames, avoiding in-place edits on slices of `data`.
    new_train = data[data['data_type'] == 0].drop(columns='data_type')
    new_test = data[data['data_type'] == 1].drop(columns='data_type')
    return new_train, new_test

# Replace both splits with the versions returned by gourpby.
train, test = gourpby(train, test)
```

## 1.4 clustering method

```python
# Clustering
def cluster(train, test):
    """Add a Gaussian-mixture cluster label and per-cluster mean features.

    Train and test are stacked, a 3-component GaussianMixture is fitted on the
    columns of ``col1 + col2``, and for every (col1 feature, col2 feature) pair
    the mean of the col2 feature within each (cluster, col1 value) group is
    merged back as ``<col2>_<col1>_cluster_mean``.

    Args:
        train: training DataFrame; not modified.
        test: test DataFrame with the same columns; not modified.

    Returns:
        Tuple ``(new_train, new_test)`` with ``cluster`` and the mean columns added.
    """
    from sklearn.mixture import GaussianMixture

    # Tag the rows on copies so the caller's frames are left untouched.
    data = pd.concat([train.assign(data_type=0), test.assign(data_type=1)],
                     axis=0, join='outer')

    # Grouping features (col1) and the numeric features averaged per group
    # (col2). In the source these had collapsed into one broken list literal
    # and col2 was never defined.
    col1 = ['totalFloor', 'houseDecoration', 'communityName', 'region', 'plate', 'buildYear']
    col2 = ['landTotalPrice', 'landMeanPrice', 'totalWorkers',
            'newWorkers', 'residentPopulation', 'lookNum',
            'trainsportNum',
            'all_SchoolNum', 'all_hospitalNum', 'all_mall', 'otherNum']
    col = col1 + col2

    # EM
    gmm = GaussianMixture(n_components=3, covariance_type='full', random_state=0)
    # Assign the label array directly: wrapping it in a DataFrame would align
    # on index labels, which only matches row order when the stacked frame
    # happens to be indexed 0..N-1.
    data['cluster'] = gmm.fit_predict(data[col])

    for feature1 in col1:
        for feature2 in col2:
            mean_name = feature2 + '_' + feature1 + '_cluster_mean'
            temp = data.groupby(['cluster', feature1])[feature2].agg('mean').reset_index(name=mean_name)
            temp.fillna(0, inplace=True)
            data = data.merge(temp, on=['cluster', feature1], how='left')

    # drop() returns new frames, avoiding in-place edits on slices of `data`.
    new_train = data[data['data_type'] == 0].drop(columns='data_type')
    new_test = data[data['data_type'] == 1].drop(columns='data_type')

    return new_train, new_test

# Replace both splits with the versions returned by cluster.
train, test = cluster(train, test)
```

## 1.5 log smoothing

```python
# Log-smooth features with a very wide value range (helps linear models)
import numpy as np

# NOTE(review): the beginning of this list (including the assignment itself)
# was lost in the source; only its last three entries survive. Restore the
# missing column names before relying on it.
big_num_cols = ['residentPopulation', 'pv', 'uv']
for col in big_num_cols:
    # log1p(x) = log(1 + x): compresses large values and maps 0 to 0.
    train[col] = np.log1p(train[col])
    test[col] = np.log1p(test[col])

```
```python
# Compare linear-model results before and after feature engineering
test = test.fillna(0)
# Lasso regression
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=0.1)
lasso.fit(train, target_train)
# Predict on the training and test sets
y_pred_train = lasso.predict(train)
y_pred_test = lasso.predict(test)

# Compare the scores
from sklearn.metrics import r2_score
# r2_score takes (y_true, y_pred). R^2 is not symmetric, so passing the
# predictions first reports a different (wrong) number.
score_train = r2_score(target_train, y_pred_train)
print("Training set results:", score_train)
score_test = r2_score(target_test, y_pred_test)
print("Test set results:", score_test)
```

# 2. Feature selection

```python
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder

```

## 2.1 correlation coefficient method

```python
# Feature selection with the correlation coefficient method
from sklearn.feature_selection import SelectKBest, f_regression

print(train.shape)

# SelectKBest defaults to f_classif, which treats the target as class labels.
# For a continuous target, f_regression scores each feature by its linear
# correlation with the target.
sk = SelectKBest(score_func=f_regression, k=150)
sk.fit(train, target_train)

# Positions of the selected columns
select_columns = sk.get_support(indices=True)
# print(select_columns)

# Matching column names. The positions refer to the frame the selector was
# fitted on, so they are looked up in train, and both splits are then
# selected by name.
select_columns_name = train.columns[select_columns]
new_train = train[select_columns_name]
print(new_train.shape)
new_test = test[select_columns_name]
print(new_test.shape)

# Lasso regression
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.1)
lasso.fit(new_train, target_train)
# Predict on the training and test sets
y_pred_train = lasso.predict(new_train)
y_pred_test = lasso.predict(new_test)

# Compare the scores
from sklearn.metrics import r2_score
# r2_score takes (y_true, y_pred); the order matters.
score_train = r2_score(target_train, y_pred_train)
print("Training set results:", score_train)
score_test = r2_score(target_test, y_pred_test)
print("Test set results:", score_test)
```

## 2.2 Wrapper

```python
# Wrapper

from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Recursive feature elimination: refit the linear model repeatedly, discarding
# the weakest feature each round until 160 remain.
lr = LinearRegression()
rfe = RFE(lr, n_features_to_select=160)
rfe.fit(train, target_train)
# The notebook echo of the fitted estimator that followed this call was pasted
# in as code. It built a second, unused RFE (with a different feature count)
# and passed `normalize`, which current scikit-learn no longer accepts, so it
# has been removed.

select_columns = [f for f, s in zip(train.columns, rfe.support_) if s]
print(select_columns)
new_train = train[select_columns]
new_test = test[select_columns]

# Lasso regression
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.1)
lasso.fit(new_train, target_train)
# Predict on the training and test sets
y_pred_train = lasso.predict(new_train)
y_pred_test = lasso.predict(new_test)

# Compare the scores
from sklearn.metrics import r2_score
# r2_score takes (y_true, y_pred); the order matters.
score_train = r2_score(target_train, y_pred_train)
print("Training set results:", score_train)
score_test = r2_score(target_test, y_pred_test)
print("Test set results:", score_test)
```

## 2.3 Embedded

Feature selection method based on penalty terms: Lasso(l1) and Ridge(l2)

```python
# Embedded
# Feature selection method based on penalty term
# Lasso(l1) and Ridge(l2)

from sklearn.linear_model import Ridge

ridge = Ridge(alpha=5)
ridge.fit(train, target_train)
# The notebook echo of the fitted Ridge that followed this call was pasted in
# as code. It built an unused estimator and passed `normalize`, which current
# scikit-learn no longer accepts, so it has been removed.

# Coefficient positions, ascending by coefficient value
coefSort = ridge.coef_.argsort()
print(coefSort)

# Coefficients in that sorted order
featureCoefSore = ridge.coef_[coefSort]
print(featureCoefSore)

# Keep features whose absolute coefficient exceeds 0.0000005.
# The columns are paired with ridge.coef_ itself: featureCoefSore is sorted,
# so zipping it with train.columns would attach each coefficient to the wrong
# column.
select_columns = [f for f, s in zip(train.columns, ridge.coef_) if abs(s) > 0.0000005]

new_train = train[select_columns]
new_test = test[select_columns]

# Lasso regression
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.1)
lasso.fit(new_train, target_train)
# Predict on the training and test sets
y_pred_train = lasso.predict(new_train)
y_pred_test = lasso.predict(new_test)

# Compare the scores
from sklearn.metrics import r2_score
# r2_score takes (y_true, y_pred); the order matters.
score_train = r2_score(target_train, y_pred_train)
print("Training set results:", score_train)
score_test = r2_score(target_test, y_pred_test)
print("Test set results:", score_test)
```

## 2.4 feature selection method based on tree model

Mean decrease in impurity (MDI) of a random forest

```python
# Embedded
# Feature selection method based on a tree model
# Mean decrease in impurity of a random forest

from sklearn.ensemble import RandomForestRegressor
# Train a random forest; the importance score of each feature is then
# available through the feature_importances_ attribute.
# random_state is fixed so the selected feature set is reproducible.
rf = RandomForestRegressor(random_state=0)
rf.fit(train, target_train)
print("Features sorted by their score:")
print(sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), train.columns),
             reverse=True))

# Keep features whose importance exceeds 0.00005
select_columns = [f for f, s in zip(train.columns, rf.feature_importances_) if abs(s) > 0.00005]

new_train = train[select_columns]
new_test = test[select_columns]

# Lasso regression
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.1)
lasso.fit(new_train, target_train)
# Predict on the training and test sets
y_pred_train = lasso.predict(new_train)
y_pred_test = lasso.predict(new_test)

# Compare the scores
from sklearn.metrics import r2_score
# r2_score takes (y_true, y_pred); the order matters.
score_train = r2_score(target_train, y_pred_train)
print("Training set results:", score_train)
score_test = r2_score(target_test, y_pred_test)
print("Test set results:", score_test)
```
84 original articles published, 34 praised, 10000 visitors+

Tags: Lambda Attribute

Posted on Sat, 11 Jan 2020 09:06:56 -0800 by jabba_29