Python分析电信运营商客户流失
企业通过客户消费获利,拥有大量客户是企业成功的必要条件。在实际生活中,企业获取客户之后,客户并不会只在一家企业消费,因而企业的客户群存在一定的流失率,降低客户流失率也因此成为学者和企业关注的研究重点。在运营商行业,客户流失问题同样是研究焦点。本项目获取了电信运营商的部分客户流失数据,通过挖掘数据中的信息得出若干结论,最后构建预测模型预测用户流失的可能性。运营商一方面可以根据客户在各特征上的表现预估其流失可能,并针对相应特征采取保留措施;另一方面也可以直接用预测模型预测客户流失可能,再结合客户特征制定个性化的保留策略。
本文来自Telco Churn Management Handbook英文原版,中文译名《电信业客户流失管理》,作者ROB MATTISON(罗布·马蒂森)。 是电信客户流失方面分析的经典。
分析开始
导入库
# Libraries used throughout the analysis.
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# SimHei is a CJK font, needed for the Chinese plot titles used below;
# unicode_minus=False makes matplotlib draw minus signs with the ASCII hyphen.
plt.rcParams.update({'font.family': 'SimHei', 'axes.unicode_minus': False})
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Never elide columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
data = pd.read_csv(r'/share/datasets/WA_Fn-UseC_-Telco-Customer-Churn.csv')
data.head(10)
数据理解
# Inspect the dataset size as (rows, columns).
data.shape
本数据集描述了电信用户是否流失以及其相关信息,共包含7043条数据,共21个字段,分别介绍如下:
- customerID : 用户ID。
- gender:性别。(Female & Male)
- SeniorCitizen :老年用户 (1表示是,0表示不是)
- Partner :伴侣用户 (Yes or No)
- Dependents :亲属用户 (Yes or No)
- tenure : 在网时长(0-72月)
- PhoneService : 是否开通电话服务业务 (Yes or No)
- MultipleLines: 是否开通了多线业务(Yes、No、No phone service 三种)
- InternetService:是否开通互联网服务 (No、DSL 数字网络、Fiber optic 光纤网络 三种)
- OnlineSecurity:是否开通网络安全服务(Yes、No、No internet service 三种)
- OnlineBackup:是否开通在线备份业务(Yes、No、No internet service 三种)
- DeviceProtection:是否开通了设备保护业务(Yes、No、No internet service 三种)
- TechSupport:是否开通了技术支持服务(Yes、No、No internet service 三种)
- StreamingTV:是否开通网络电视(Yes、No、No internet service 三种)
- StreamingMovies:是否开通网络电影(Yes、No、No internet service 三种)
- Contract:签订合同方式 (按月,一年,两年)
- PaperlessBilling:是否开通电子账单(Yes or No)
- PaymentMethod:付款方式(bank transfer,credit card,electronic check,mailed check)
- MonthlyCharges:月费用
- TotalCharges:总费用
- Churn:该用户是否流失(Yes or No)
数据一共21列,进行归纳梳理,分用户画像指标,消费产品指标,消费信息指标
- (1)用户画像指标:
人口统计指标:'gender','SeniorCitizen','Partner','Dependents',
用户活跃度: 'tenure'
- (2)消费产品指标:
手机服务: 'PhoneService', 'MultipleLines',
网络服务: 'InternetService' ,'OnlineSecurity','OnlineBackup','DeviceProtection', 'TechSupport','StreamingTV','StreamingMovies'
- (3)消费信息指标:
收费:'MonthlyCharges','TotalCharges'
收入相关指标:'Contract', 'PaperlessBilling', 'PaymentMethod'
# Column dtypes and non-null counts.
data.info()
# Number of missing values in each column.
data.isna().sum()
def cal_churn(col, df=None):
    """Return the churn rate of every category found in column ``col``.

    The rate of a category is the number of its rows whose ``Churn`` equals 1
    divided by the number of all its rows.

    Args:
        col: Name of the column whose categories are evaluated.
        df: Frame to analyse; it must contain ``col`` and a ``Churn`` column.
            When omitted, the module-level ``data`` frame is used.

    Returns:
        dict mapping each category value (in order of first appearance) to
        its churn rate as a float.
    """
    frame = data if df is None else df
    churned = frame['Churn'] == 1
    # One grouped pass instead of filtering the whole frame twice per
    # category; sort=False keeps the categories in order of appearance.
    rates = churned.groupby(frame[col], sort=False).mean()
    return {category: float(rate) for category, rate in rates.items()}
# Encode the target as integers (No -> 0, Yes -> 1) so rates can be computed.
# The result is assigned back to the column: a chained
# `data['Churn'].replace(..., inplace=True)` only modifies a temporary object
# when pandas Copy-on-Write is active and would leave `data` unchanged.
data['Churn'] = data['Churn'].replace({'No': 0, 'Yes': 1}).astype('int64')

# customerID carries no information and is dropped before modelling; the
# object-typed columns are encoded later. TotalCharges is stored as text and
# uses a blank string for missing values, so it is cleaned here.
# Non-numeric entries become NaN and are then filled with the median of the
# valid values (the median is no longer skewed by placeholder zeros).
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
data['TotalCharges'] = data['TotalCharges'].fillna(data['TotalCharges'].median())

# Churned customers. Taken only after the cleaning above so that this subset
# carries the numeric TotalCharges column rather than the raw strings.
churn_df = data[data['Churn'] == 1]
# Number of retained versus churned customers.
# The column is passed by keyword: a lone positional argument is interpreted
# as `data` (not `x`) by newer seaborn releases.
sns.countplot(x='Churn', data=data)
plt.show()
# Churn counts per demographic attribute, each category labelled with its
# churn rate.
fig = plt.figure(figsize=(8, 8))
for i, j in enumerate(['gender', 'SeniorCitizen', 'Partner', 'Dependents']):
    ax1 = fig.add_subplot(2, 2, i + 1)
    j_factor = data[j].unique()
    # The attribute goes on the x-axis, in the same order used for the labels
    # below, so the k-th rate label sits over the k-th category. With Churn
    # on the x-axis the labels landed over the Churn=0 / Churn=1 groups.
    ax = sns.countplot(x=j, hue='Churn', data=data, order=list(j_factor), ax=ax1)
    # Churn rate of every category of this attribute.
    j_churn = cal_churn(j)
    # Horizontal offset of the first label and spacing between labels; both
    # are always defined, whatever the number of categories.
    start, span = (0.1, 1) if len(j_factor) == 2 else (0.02, 1)
    for k, n in enumerate(j_factor):
        # Text is passed positionally: the `s=` keyword was removed from
        # matplotlib's annotate().
        ax.annotate('%s' % round(j_churn[n], 2), xy=(start + k * span, 50))
    ax1.set_title(j)
plt.show()
可以明显看出,老年用户、单身(无伴侣)用户和无亲属用户的流失率较高。
# Churn counts per subscribed service, each category labelled with its churn
# rate.
fig = plt.figure(figsize=(16, 16))
for i, j in enumerate(['PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']):
    ax1 = fig.add_subplot(3, 3, i + 1)
    j_factor = data[j].unique()
    # Fix the bar order to the order used for the labels below so that the
    # k-th rate label always sits over the k-th category.
    ax = sns.countplot(x=j, hue='Churn', data=data, order=list(j_factor), ax=ax1)
    # Churn rate of every category of this service.
    j_churn = cal_churn(j)
    # Horizontal offset of the first label and spacing between labels; both
    # are always defined, whatever the number of categories.
    start, span = (0.1, 1) if len(j_factor) == 2 else (0.02, 1)
    for k, n in enumerate(j_factor):
        # Text is passed positionally: the `s=` keyword was removed from
        # matplotlib's annotate().
        ax.annotate('%s' % round(j_churn[n], 2), xy=(start + k * span, 50))
    ax1.set_title(j)
plt.show()
# Same services viewed the other way round: Churn on the x-axis, one bar per
# service category within each churn group.
fig = plt.figure(figsize=(16, 16))
for i, j in enumerate(['PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']):
    ax1 = fig.add_subplot(3, 3, i + 1)
    # Draw on the subplot that was just created (made explicit with ax=).
    ax = sns.countplot(x='Churn', hue=j, data=data, ax=ax1)
    ax1.set_title(j)
plt.show()
# 2x2 grid of 20-bin histograms: monthly charges on the top row, total
# charges (in red) on the bottom row; churned customers on the left, all
# customers on the right.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(8, 8))
hist_style = {'bins': 20, 'alpha': 0.7}

ax1.hist(churn_df['MonthlyCharges'].values, **hist_style)
ax1.set_title("流失客户月消费分布图")

ax2.hist(data['MonthlyCharges'].values, **hist_style)
ax2.set_title("整体客户月消费分布图")

ax3.hist(churn_df['TotalCharges'].values, color='r', **hist_style)
ax3.set_title("流失客户总消费分布图")

ax4.hist(data['TotalCharges'].values, color='r', **hist_style)
ax4.set_title("整体客户总消费分布图")

plt.show()
最后总结高流失人群特征:
用户维度:老人,单身,无亲属,在网时长小于24个月
产品维度:开通多线服务,开通光纤网络,不开通技术性增值服务
消费特征:月消费60-100元,选择月签,电子支付
流失降低措施
def hq_user_payment_bar(col, condition):
    """Plot contract, billing and payment choices of high-quality users.

    High-quality users are the rows of the module-level ``data`` frame where
    ``col`` equals ``condition``, tenure exceeds 24, the customer has not
    churned and the monthly charge exceeds 60.

    Args:
        col: Column used to select the user group.
        condition: Value ``col`` must equal.
    """
    selected = (
        (data[col] == condition)
        & (data['tenure'] > 24)
        & (data['Churn'] == 0)
        & (data['MonthlyCharges'] > 60)
    )
    hq_df = data[selected].sort_values('tenure', ascending=False)

    canvas = plt.figure(figsize=(15, 25))
    # Only the first row of the 4x3 grid is used: one panel per field.
    for slot, field in enumerate(['Contract', 'PaperlessBilling', 'PaymentMethod'], start=1):
        panel = canvas.add_subplot(4, 3, slot)
        sns.countplot(x='Churn', hue=field, data=hq_df, ax=panel)
        panel.set_title(field)
def hq_user_service_score(col, condition, df=None):
    """Return the share of high-quality users subscribed to each service.

    High-quality users are the rows where ``col`` equals ``condition``,
    tenure exceeds 24, the customer has not churned and the monthly charge
    exceeds 60.

    Args:
        col: Column used to select the user group.
        condition: Value ``col`` must equal.
        df: Frame to analyse. When omitted, the module-level ``data`` frame
            is used.

    Returns:
        dict of shares in [0, 1]. Every service column contributes one entry
        (the share of "Yes"), except ``InternetService``, which contributes
        one entry per category present among the selected users.

    Raises:
        ZeroDivisionError: If no row matches the selection.
    """
    frame = data if df is None else df
    selected = (
        (frame[col] == condition)
        & (frame['tenure'] > 24)
        & (frame['Churn'] == 0)
        & (frame['MonthlyCharges'] > 60)
    )
    # Sorting fixes the order in which InternetService categories appear
    # in the result. No figure is created here: this function only computes
    # numbers, and the figure it used to open was never drawn on.
    hq_df = frame[selected].sort_values('tenure', ascending=False)
    total = hq_df.shape[0]

    result_dic = {}
    for service in ['PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling']:
        if service == "InternetService":
            # Multi-valued column: report the share of each connection type.
            for kind in hq_df['InternetService'].unique():
                result_dic[kind] = int((hq_df[service] == kind).sum()) / total
        else:
            result_dic[service] = int((hq_df[service] == "Yes").sum()) / total
    return result_dic
def hq_user_service_radar(col, condition):
    """Draw a radar chart of the service shares of high-quality users.

    The shares come from ``hq_user_service_score(col, condition)``; each key
    of that dict becomes one spoke of the chart.

    Args:
        col: Column used to select the user group.
        condition: Value ``col`` must equal.
    """
    score_result = hq_user_service_score(col, condition)
    labels = np.array(list(score_result.keys()))
    stats = np.array(list(score_result.values()))
    # One spoke per label, evenly spread around the circle.
    angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False)
    # Repeat the first point at the end so the outline closes on itself.
    closed_stats = np.concatenate((stats, [stats[0]]))
    closed_angles = np.concatenate((angles, [angles[0]]))

    fig = plt.figure(figsize=(6, 10))
    ax = fig.add_subplot(111, polar=True)
    ax.plot(closed_angles, closed_stats, 'o-', linewidth=2)
    ax.fill(closed_angles, closed_stats, alpha=0.25)
    # Use the un-closed angles here: there must be exactly one tick per
    # label, otherwise matplotlib rejects the mismatching lengths.
    ax.set_thetagrids(angles * 180 / np.pi, labels)
    ax.set_rlabel_position(0.20)
    ax.set_title("订购服务雷达图")
在高流失用户群(老人/单身/无亲属/月签/电子支付)寻找在网时长高(大于24个月),月消费高(大于60),未流失的高质量用户,挖掘其消费模型,进而推广至该用户群其他用户。
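老年高质量用户绝大多数选择开通手机服务,光纤网络的需求远大于DSL,对在线备份(OnlineBackup)、设备保护(DeviceProtection)、网络电视(StreamingTV)、网络电影(StreamingMovies)有较强的需求,对网络安全(OnlineSecurity)和技术支持(TechSupport)的需求较弱;合同方式偏向选择短期,多使用无纸账单,很少有用户选择邮寄支票(Mailed check)支付。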
# Senior citizens (SeniorCitizen == 1): service radar chart, then the
# contract / billing / payment bar charts of the high-quality users.
hq_user_service_radar('SeniorCitizen',1)
hq_user_payment_bar('SeniorCitizen',1)
单身高质量用户绝大多数选择开通手机服务,对网络服务需求反倒一般,对技术和娱乐增值服务需求也一般,合同方式三种差异不大,偏向无纸账单,非mail支付方式.
# Customers without a partner (Partner == 'No'): service radar chart, then
# the contract / billing / payment bar charts of the high-quality users.
hq_user_service_radar('Partner','No')
hq_user_payment_bar('Partner','No')
无亲属高质量用户会开通手机服务和网络服务,对在线备份(OnlineBackup)、设备保护(DeviceProtection)的需求有所提升,合同方式选择差异不大,支付方式大多选择邮寄支票(Mailed check)以外的三种方式之一。
# Customers without dependents (Dependents == 'No'): service radar chart,
# then the contract / billing / payment bar charts of the high-quality users.
hq_user_service_radar('Dependents','No')
hq_user_payment_bar('Dependents','No')
月签高质量用户手机服务是基本需求,多线需求较强,光纤需求远大于DSL,对技术性和娱乐性增值服务需求都偏弱,大多数选择无纸账单,偏向选择电子支付
# Month-to-month contracts: service radar chart, then the contract / billing
# / payment bar charts of the high-quality users.
hq_user_service_radar('Contract','Month-to-month')
hq_user_payment_bar('Contract','Month-to-month')
数据处理
# Frame used to train the models: everything except the customerID column.
class_data = data.drop(columns=['customerID'])

# Integer code of every category, per column. The numeric values are the
# ones this analysis has always used.
yes_no_codes = {"No": 0, "Yes": 1}
addon_codes = {"No": 0, "Yes": 1, "No internet service": 2}
category_codes = {
    'gender': {"Female": 0, "Male": 1},
    'Partner': yes_no_codes,
    'Dependents': yes_no_codes,
    'PhoneService': yes_no_codes,
    'MultipleLines': {"No": 0, "Yes": 1, "No phone service": 2},
    'InternetService': {"No": 0, "Fiber optic": 1, "DSL": 2},
    'OnlineSecurity': addon_codes,
    'OnlineBackup': addon_codes,
    'DeviceProtection': addon_codes,
    'TechSupport': addon_codes,
    'StreamingTV': addon_codes,
    'StreamingMovies': addon_codes,
    'Contract': {"Month-to-month": 0, "Two year": 1, "One year": 2},
    'PaperlessBilling': yes_no_codes,
    'PaymentMethod': {
        "Electronic check": 0,
        "Mailed check": 1,
        "Bank transfer (automatic)": 2,
        "Credit card (automatic)": 3,
    },
}

# Each encoded column is assigned back to the frame. A chained
# `class_data[col].replace(..., inplace=True)` only modifies a temporary
# object when pandas Copy-on-Write is active, leaving the text untouched.
for column, codes in category_codes.items():
    encoded = class_data[column].map(codes)
    if encoded.isna().any():
        # A value without a code would otherwise surface much later as an
        # obscure failure during model fitting.
        unexpected = class_data.loc[encoded.isna(), column].unique().tolist()
        raise ValueError(f"unexpected values in column {column!r}: {unexpected}")
    class_data[column] = encoded.astype('int64')
# Correlation of every numeric feature with the churn flag, largest first.
plt.figure(figsize=(8, 4))
# Use the modelling frame rather than the raw `data`: `data` still holds
# text columns, which DataFrame.corr() rejects in recent pandas releases.
# Restricting to numeric dtypes keeps the call valid on every pandas version.
corr_matrix = class_data.select_dtypes(include='number').corr()
corr_matrix['Churn'].sort_values(ascending=False).plot(kind='bar')
plt.show()
算法选择
# Target vector: the churn flag.
labels = class_data['Churn']
# Feature matrix: remove the target together with gender and PhoneService.
class_data = class_data.drop(columns=['gender', 'PhoneService', 'Churn'])
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed seed makes the split
# repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    class_data,
    labels,
    test_size=0.3,
    random_state=42,
)
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
# 构造分类器
clfs = [
SVC(),
GaussianNB(),
KNeighborsClassifier(),
DecisionTreeClassifier(),
RandomForestClassifier(),
KNeighborsClassifier(),
AdaBoostClassifier(),
]
# 分类器名称
clf_name = [
'svc',
'gaussiannbclassifier',
'KNNclassifier',
'decisiontreeclassifier',
'randomforestclassifier',
'kneighborsclassifier',
'adaboostclassifier',
]
def train_model(clf, train_x, train_y, test_x, test_y):
    """Fit ``clf`` on the training data and report its test-set score.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``score(X, y)``.
        train_x: Training features.
        train_y: Training labels.
        test_x: Test features.
        test_y: Test labels.

    Returns:
        Whatever ``clf.score(test_x, test_y)`` returns. The estimator and
        the score are also printed.
    """
    clf.fit(train_x, train_y)
    clf_score = clf.score(test_x, test_y)
    print("模型:", clf)
    print("准确率 %0.4lf" % clf_score)
    # Hand the score back so callers that store the result get a value
    # instead of None.
    return clf_score
# Fit and evaluate every candidate classifier on the same split.
# Whatever train_model returns is kept per classifier name, so the outcome
# of earlier models is no longer overwritten by later ones.
results = {}
for model, model_name in zip(clfs, clf_name):
    result = train_model(model, train_x, train_y, test_x, test_y)
    results[model_name] = result
# Tune AdaBoost with an exhaustive grid search: 3 x 3 parameter combinations,
# 5-fold cross-validation, accuracy as the selection metric, all CPU cores.
ada_clf = AdaBoostClassifier(random_state=42)
ada_param = [
    {
        'n_estimators': [30, 60, 90],
        'learning_rate': [0.1, 1, 10],
    }
]
ada_search = GridSearchCV(
    estimator=ada_clf,
    param_grid=ada_param,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
ada_search.fit(train_x, train_y)

from sklearn import metrics
# Score the best estimator found by the search on the held-out test set.
best_ada = ada_search.best_estimator_
pre_y = best_ada.predict(test_x)
accuracy_score = metrics.accuracy_score(test_y, pre_y)
accuracy_score
总结
在实际业务分析中,业务理解和数据理解为主要环节,通过业务理解将商务问题转换为数据问题,通过数据理解剖析数据中各个变量的关系、业务目标的主要影响因素,有针对性地提出解决方法。随着算法计算的普及,使用算法进行大规模批量运算进行预测分析大大节省了人力,同时在进行算法分析时数据的处理也是一个重要问题,这就将现代数据挖掘问题联系起来。
建议使用PC或笔记本电脑,浏览器使用Chrome或FireFox进行浏览,以开启左侧互动实验区来提升学习效率,推荐使用的分辨率为1920x1080或更高。
我们坚信最好的学习是参与其中这一理念,并致力成为中文互联网上体验更好的学练一体的IT技术学习交流平台。

您可加QQ群:575806994,一起学习交流技术,反馈网站使用中遇到问题。
内容、课程、广告等相关合作请扫描右侧二维码添加好友。
狐狸教程 Copyright 2021