

安装 数据加载 数据探索(EDA) 训练预测模型 模型评价 模型解释
import pandas as pdimport warningswarnings.filterwarnings('ignore')import logginglogging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)from ads.dataset.factory import DatasetFactory

flights = DatasetFactory.open("/home/datascience/ads-examples/3P_data/flights.csv")flights.head()

DepDelay设定为预测变量
DepDelay"DepDelay"以外的其他变量。通常我们表示为“X”
flights = flights.set_target('DepDelay')
show_in_notebook()函数,可轻松确认数据分布情况,并对缺失数据、不均衡数据等问题进行自动化处理,无需单独写代码。
flights.show_in_notebook()




flights_nocheat = flights.drop_columns(["ArrDelay", "ArrTime", "DepTime","AirTime", "ActualElapsedTime"]).sample(frac=0.1)
flights_nocheat.target.show_in_notebook(feature_names=["DayOfWeek", "DayofMonth","UniqueCarrier"])



type(flights_nocheat)# RegressionDataset
# 转换前show_in_notebook() 默认显示为 Histogram Graphflights_nocheat.target.show_in_notebook()

flights_binary = flights_nocheat.assign_column("DepDelay", lambda x: x is None or x>15)type(flights_binary)# BinaryClassificationDataset
# 数据转换后, show_in_notebook() 默认显示为Count Plot Graphflights_binary.target.show_in_notebook()

删除常数和主键字段 缺失值自动填补 删除线性相关的自变量 非均衡数据自动处理
flights_binary.get_recommendations()

flights_clean = flights_binary.auto_transform(fix_imbalance=False)
flights_clean.show_in_notebook()
[len(flights_binary.columns), len(flights_clean.columns)]

# 相关性Top12变量选择modeling_dataset = flights_clean.select_best_features(k=12)print(modeling_dataset.columns.values)

import h2oimport osfrom h2o.automl import H2OAutoML
# H2O Proxy 初始化http_proxy = os.environ.get("http_proxy", "")HTTP_PROXY = os.environ.get("HTTP_PROXY", "")os.environ["HTTP_PROXY"] = ""os.environ["http_proxy"] = ""h2o.init()

h2o_df = modeling_dataset.to_h2o_dataframe()h2o_df.head(5)

train, valid, test = h2o_df.split_frame(ratios=list([.7, .15]), seed = 1)[train.shape, valid.shape, test.shape]

x = train.columnsy = "DepDelay"x.remove(y)
%%timeh2o_aml = H2OAutoML(max_models=20, seed=1, max_runtime_secs=100)h2o_aml.train(x=x, y=y, training_frame=train, validation_frame=valid)h2o_model = h2o_aml.leader

print("\nH2O Accuracy: ", (h2o_model.predict(test)[0] == test["DepDelay"]).sum() / test.shape[0])

os.environ["http_proxy"] = http_proxyos.environ["HTTP_PROXY"] = HTTP_PROXY
from ads.automl.provider import OracleAutoMLProvider
train, valid, test = modeling_dataset.train_validation_test_split(test_size=0.1, validation_size=0.2, random_state=42)[train.X.shape, valid.X.shape, test.X.shape]

回归预测中,ZeroR基准模型会预测训练数据集中的平均值 分类预测中,ZeroR基准模型会预测训练数据集中最频繁出现的值
选择最佳的变量 最少的Sampling 选择最佳的算法 选择最佳的参数
olabs_automl = OracleAutoMLProvider()
import lightgbm
%%timefrom ads.automl.driver import AutoMLautoml = AutoML(train, validation_data=valid,provider=olabs_automl)olabs_model, baseline = automl.train(min_features=["CRSDepTime", "TaxiOut"], model_list=["LogisticRegression", "LGBMClassifier", "XGBClassifier", "RandomForestClassifier"], random_state = 42, score_metric = 'roc_auc')


print("\nOLabs Score: ", olabs_model.score(test.X, test.y))

olabs_model.selected_model_params_

olabs_model.num_total_trials_

olabs_model.summary()

automl.visualize_feature_selection_trials()

automl.visualize_tuning_trials()

olabs_model.visualize_transforms()


from ads.evaluations.evaluator import ADSEvaluatorfrom ads.common.data import MLDataevaluator = ADSEvaluator(test, models=[baseline, olabs_model])evaluator.show_in_notebook()





Explainability: 对模型预测的依据的解释和理解 Interpretability: 能够理解模型解释结果的人的水准 Global Explanations: 对模型本身的理解 local Explanations: 对特定数据集预测结果的解释 Model-Agnostic Explanations: 对模型训练过程以及变量的选择过程理解为黑盒子,不进行解释
from ads.explanations.explainer import ADSExplainerfrom ads.explanations.mlx_global_explainer import MLXGlobalExplainer
olabs_explainer = ADSExplainer(test, olabs_model)global_explainer = olabs_explainer.global_explanation(provider=MLXGlobalExplainer())
global_explainer.feature_importance_summary()



feature_importance = global_explainer.compute_feature_importance()
feature_importance.get_global_explanation()

文章转载自火火日记,如果涉嫌侵权,请发送邮件至:contact@modb.pro进行举报,并提供相关证据,一经查实,墨天轮将立刻删除相关内容。




