# Two side-by-side stacked histograms (32 bins each). In every panel the
# first series is drawn in blue and the second in red.
# NOTE(review): a0/b0/a2/b2 are defined outside this view -- presumably the
# normal vs. anomalous values of each search class; confirm.
fig, axs = plt.subplots(1, 2)
for hist_ax, (hist_data, hist_title) in zip(axs, [
    ([a0, b0], "Search Non Saturday Night"),
    ([a2, b2], "Search Saturday Night"),
]):
    hist_ax.hist(hist_data, bins=32, stacked=True, color=['blue', 'red'])
    hist_ax.set_title(hist_title)
plt.show()
# add the data to the main
# Concatenate the per-class results and copy their 'anomaly' column onto df.
df_class = pd.concat([df_class0, df_class1])
df['anomaly5'] = df_class['anomaly']

# Rows whose 'anomaly5' value is -1 are the ones plotted as anomalies.
# Columns are selected with a list: a tuple inside .loc is also how pandas
# spells a MultiIndex key, so the list form is the unambiguous one.
a = df.loc[df['anomaly5'] == -1, ['date_time_int', 'price_usd']]  # anomaly

# Full price series as a blue line, with the anomalous rows overlaid in red.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(df['date_time_int'], df['price_usd'], color='blue', label='Normal')
ax.scatter(a['date_time_int'], a['price_usd'], color='red', label='Anomaly')
ax.legend()
plt.show()
有趣的是可以看到,以这种方式检测到的异常点只有异常的高价点而没有异常的低价点。
马尔可夫链的异常检测
为了使用马尔可夫链,我们需要先把数据点离散化为有限个状态。本例中我们用 'price_usd' 来定义状态,并将其取值划分为 5 个等级(非常低、低、平均、高、非常高),记为 (VL、L、A、H、VH)。这样,价格序列就可以表示为一条状态序列,例如 VL, L, L, A, A, H, H, VH;相邻的两个价格对应一次从一个状态到另一个状态的转移。我们可以利用历史价格数据估计马尔可夫链的转移矩阵,并用它来计算状态序列出现的概率。然后,对任何新序列计算其发生的概率,并把概率很低的罕见序列标记为异常。
# train markov model to get transition matrix
def getTransitionMatrix(df):
    """Estimate a Markov model from ``df`` and return its transition matrix.

    ``df`` is converted with ``np.array`` and handed to
    ``msm.estimate_markov_model`` together with the constant ``1``
    (presumably the lag time -- confirm against the ``msm`` library docs).

    Returns the ``transition_matrix`` attribute of the estimated model.
    """
    state_sequence = np.array(df)
    model = msm.estimate_markov_model(state_sequence, 1)
    return model.transition_matrix
# return the success probability of the state change
def successProbabilityMetric(state1, state2, transition_matrix):
    """Return the probability of moving from ``state1`` to ``state2``.

    States are 1-based: state ``s`` maps to row/column ``s - 1`` of
    ``transition_matrix``. The value is computed as one minus the sum of
    every *other* entry in the ``state1`` row, so it equals the direct
    matrix entry only when that row sums to 1.

    The column index runs over ``len(transition_matrix)``, i.e. the matrix
    is treated as square.
    """
    target_col = state2 - 1
    proba = 0
    for k in range(len(transition_matrix)):
        if k != target_col:
            proba += transition_matrix[state1 - 1][k]
    return 1 - proba
# return the success probability of the whole sequence
def sucessScore(sequence, transition_matrix):
    """Return the probability of observing ``sequence`` step by step.

    The score is the product of ``successProbabilityMetric`` over every
    consecutive pair ``(sequence[i - 1], sequence[i])``.

    A sequence with fewer than two states has no transitions; the score is
    then 0, so callers comparing against a threshold treat it as anomalous.
    """
    proba = 0
    for i in range(1, len(sequence)):
        step = successProbabilityMetric(
            sequence[i - 1], sequence[i], transition_matrix
        )
        # The first transition seeds the product; later ones multiply in.
        proba = step if i == 1 else proba * step
    return proba
# return if the sequence is an anomaly considering a threshold
def anomalyElement(sequence, threshold, transition_matrix):
    """Flag ``sequence`` as anomalous when its score is not above ``threshold``.

    Returns 0 (normal) when ``sucessScore(sequence, transition_matrix)`` is
    strictly greater than ``threshold``, and 1 (anomaly) otherwise.
    """
    score = sucessScore(sequence, transition_matrix)
    return 0 if score > threshold else 1
# return a dataframe containing anomaly result for the whole dataset