数据分析05

11.线性模拟

  1. 线性预测
  2. 线性拟合
    kx + b = y
    kx1 + b = y1
    kx2 + b = y2
    ...
    kxn + b = yn
    / x1 1 \    / k \    / y1 \
    | x2 1  | X | b | = | y2  |
    | ...      |     \    /    | ...   |
    \ xn 1/                 \ yn /
        a             x           b
                       = np.linalg.lstsq(a, b)[0]
    y = kx + b
    kx1 + b = y1' - y1
    kx2 + b = y2' - y2
    ...
    kxn + b = yn' - yn
    [y1 - (kx1 + b)]^2 +
    [y2 - (kx2 + b)]^2 + ... +
    [yn - (kxn + b)]^2 = loss = f(k, b)
    k, b? -> loss ->min
    代码:
    # -*- coding: utf-8 -*-
    from __future__ import unicode_literals
    import datetime as dt
    import numpy as np
    import matplotlib.pyplot as mp
    import matplotlib.dates as md
    
    
    def dmy2ymd(dmy):
        dmy = str(dmy, encoding='utf-8')
        date = dt.datetime.strptime(
            dmy, '%d-%m-%Y').date()
        ymd = date.strftime('%Y-%m-%d')
        return ymd
    
    
    dates, opening_prices, highest_prices, \
        lowest_prices, closing_prices = np.loadtxt(
            '../../data/aapl.csv', delimiter=',',
            usecols=(1, 3, 4, 5, 6), unpack=True,
            dtype='M8[D], f8, f8, f8, f8',
            converters={1: dmy2ymd})
    trend_points = (highest_prices + lowest_prices +
                    closing_prices) / 3
    spreads = highest_prices - lowest_prices
    resistance_points = trend_points + spreads
    support_points = trend_points - spreads
    days = dates.astype(int)
    a = np.column_stack((days, np.ones_like(days)))
    x = np.linalg.lstsq(a, trend_points)[0]
    trend_line = days * x[0] + x[1]
    x = np.linalg.lstsq(a, resistance_points)[0]
    resistance_line = days * x[0] + x[1]
    x = np.linalg.lstsq(a, support_points)[0]
    support_line = days * x[0] + x[1]
    mp.figure('Trend', facecolor='lightgray')
    mp.title('Trend', fontsize=20)
    mp.xlabel('Date', fontsize=14)
    mp.ylabel('Price', fontsize=14)
    ax = mp.gca()
    ax.xaxis.set_major_locator(md.WeekdayLocator(
        byweekday=md.MO))
    ax.xaxis.set_minor_locator(md.DayLocator())
    ax.xaxis.set_major_formatter(md.DateFormatter(
        '%d %b %Y'))
    mp.tick_params(labelsize=10)
    mp.grid(linestyle=':')
    dates = dates.astype(md.datetime.datetime)
    rise = closing_prices - opening_prices >= 0.01
    fall = opening_prices - closing_prices >= 0.01
    fc = np.zeros(dates.size, dtype='3f4')
    ec = np.zeros(dates.size, dtype='3f4')
    fc[rise], fc[fall] = (1, 1, 1), (0.85, 0.85, 0.85)
    ec[rise], ec[fall] = (0.85, 0.85, 0.85), (0.85, 0.85, 0.85)
    mp.bar(dates, highest_prices - lowest_prices, 0,
           lowest_prices, color=fc, edgecolor=ec)
    mp.bar(dates, closing_prices - opening_prices, 0.8,
           opening_prices, color=fc, edgecolor=ec)
    mp.scatter(dates, trend_points, c='dodgerblue',
               alpha=0.5, s=60, zorder=2)
    mp.plot(dates, trend_line, c='dodgerblue',
            linewidth=3, label='Trend')
    mp.scatter(dates, resistance_points, c='orangered',
               alpha=0.5, s=60, zorder=2)
    mp.plot(dates, resistance_line, c='orangered',
            linewidth=3, label='Resistance')
    mp.scatter(dates, support_points, c='limegreen',
               alpha=0.5, s=60, zorder=2)
    mp.plot(dates, support_line, c='limegreen',
            linewidth=3, label='Support')
    mp.legend()
    mp.gcf().autofmt_xdate()
    mp.show()

12.裁剪,压缩和累乘

  • ndarray.clip(min=下限,max=上限)
    将调用数组中小于和大于下限和上限的元素替换为下限和上限,返回裁剪后的数组,调用数组保持不变
  • ndarray.compress(条件)
    返回由调用数组中满足条件的元素组成的新数组
  • ndarray.prod()
    返回调用数组中所有元素的乘积 ------累乘
  • ndarray.cumprod()
    返回调用数组中所有元素执行累乘的过程数组

代码

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
a = np.array([10, 20, 30, 40, 50])
print(a)
b = a.clip(min=15, max=45)
print(b)
c = a.compress((15 <= a) & (a <= 45))
print(c)
d = a.prod()
print(d)
e = a.cumprod()
print(e)


def jiecheng(n):
    return n if n == 1 else n * jiecheng(n - 1)

n = 5
print(jiecheng(n))
jc = 1
for i in range(2, n + 1):
    jc *= i
print(jc)
print(np.arange(2, n + 1).prod())

13.协方差,相关矩阵和相关系数

  • 样本:
    A = [a1,a2,...,an]
    B = [b1,b2,...,bn]
  • 平均值:
    ave_a = (a1+a2+...+an)/n
    ave_b = (b1+b2+...+bn)/m
  • 离差
    dev_a = [a1,a2,...,an]-ave_a
    dev_b = [b1,b2,...,bn]-ave_b
  • 方差
    var_a = ave(dev_a * dev_a)
    var_b = ave(dev_b * dev_b)
  • 标准差
    std_a = sqrt(var_a)
    std_b = sqrt(var_b)
    ...............................
  • 协方差
    cov_ab = ave(dev_a * dev_b)
    cov_ba = ave(dev_b * dev_a)
  • 相关系数
    cov_ab/(std_a * std_b) = cov_ba/(std_b * std_a)
    [-1 , 1]
    -1 <------------->0<------------->1
    反相关         不相关          正相关
    <-反相关性增强-   -相关性增强->
  • 相关矩阵
    /                                                                            \
    | var_a/(std_a x std_a)    cov_ab/(std_a x std_b) |
    | cov_ba/(std_b x std_a) var_b/(std_b x std_b)    |
    \                                                                            /
    /                               \
    |       1       相关系数 |
    | 相关系数      1        |
    \                               /
    numpy.cov(a, b)->相关矩阵的分子矩阵
    numpy.corrcoef(a,b) ->相关矩阵
    代码:corr.py
    # -*- coding: utf-8 -*-
    from __future__ import unicode_literals
    import datetime as dt
    import numpy as np
    import matplotlib.pyplot as mp
    import matplotlib.dates as md
    
    
    def dmy2ymd(dmy):
        dmy = str(dmy, encoding='utf-8')
        date = dt.datetime.strptime(
            dmy, '%d-%m-%Y').date()
        ymd = date.strftime('%Y-%m-%d')
        return ymd
    
    
    dates, bhp_closing_prices = np.loadtxt(
        '../../data/bhp.csv', delimiter=',',
        usecols=(1, 6), unpack=True,
        dtype='M8[D], f8', converters={1: dmy2ymd})
    vale_closing_prices = np.loadtxt(
        '../../data/vale.csv', delimiter=',',
        usecols=(6), unpack=True)
    bhp_returns = np.diff(
        bhp_closing_prices) / bhp_closing_prices[:-1]
    vale_returns = np.diff(
        vale_closing_prices) / vale_closing_prices[:-1]
    ave_a = bhp_returns.mean()
    dev_a = bhp_returns - ave_a
    var_a = (dev_a * dev_a).sum() / (dev_a.size - 1)
    std_a = np.sqrt(var_a)
    ave_b = vale_returns.mean()
    dev_b = vale_returns - ave_b
    var_b = (dev_b * dev_b).sum() / (dev_b.size - 1)
    std_b = np.sqrt(var_b)
    cov_ab = (dev_a * dev_b).sum() / (dev_a.size - 1)
    cov_ba = (dev_b * dev_a).sum() / (dev_b.size - 1)
    corr = np.array([
        [var_a / (std_a * std_a), cov_ab / (std_a * std_b)],
        [cov_ba / (std_b * std_a), var_b / (std_b * std_b)]])
    print(corr)
    covs = np.cov(bhp_returns, vale_returns)
    stds = np.array([
        [std_a * std_a, std_a * std_b],
        [std_b * std_a, std_b * std_b]])
    corr = covs / stds
    print(corr)
    corr = np.corrcoef(bhp_returns, vale_returns)
    print(corr)
    mp.figure('Correlation Of Returns',
              facecolor='lightgray')
    mp.title('Correlation Of Returns', fontsize=20)
    mp.xlabel('Date', fontsize=14)
    mp.ylabel('Returns', fontsize=14)
    ax = mp.gca()
    ax.xaxis.set_major_locator(md.WeekdayLocator(
        byweekday=md.MO))
    ax.xaxis.set_minor_locator(md.DayLocator())
    ax.xaxis.set_major_formatter(md.DateFormatter(
        '%d %b %Y'))
    mp.tick_params(labelsize=10)
    mp.grid(linestyle=':')
    dates = dates.astype(md.datetime.datetime)
    mp.plot(dates[:-1], bhp_returns, c='orangered',
            label='BHP')
    mp.plot(dates[:-1], vale_returns, c='dodgerblue',
            label='VALE')
    mp.legend()
    mp.gcf().autofmt_xdate()
    mp.show()

14.多项式拟合

  • y = p0x^n + p1x^n-1 + p2x^n-2 + ... + pn = f(x)
    y1' = f(x1) -> y1
    y2' = f(x2) -> y2
    ...
    yn' = f(xn) -> yn
    (y1-y1')^2 + (y2-y2')^2 + ... + (yn-yn')^2
    = loss (p0, ..., pn)
    p0, ..., pn = ? -> loss -> min
    X = [x1, x2, ..., xn] - 自变量
    Y = [y1, y2, ..., yn] - 实际函数值
    Y'= [y1',y2',...,yn'] - 拟合函数值
    P = [p0, p1, ..., pn] - 多项式函数中的系数
    Q = [q0, q1, ..., qn-1] - 多项式函数导函数的系数
    np.polyfit(X, Y, 最高次幂)->P
    np.polyval(P, X)->Y'
    np.polyder(P)->Q 
    y = 4x^3 + 3x^2 + 2x + 1, P=[4,3,2,1]
    dy/dx = 12x^2 + 6x + 2, Q=[12, 6, 2]
    4x^3 + 3x^2 + 2x + 1 = 0的根:np.roots(P)
    np.polysub(P1, P2)->两个多项式函数的差函数的系数
    y = 4x^3 + 3x^2 + 2x + 1, P1=[4,3,2,1]
    y = 5x^4 + x, P2=[5, 0, 0, 1, 0]
    y = -5x^4 + 4x^3 + 3x^2 + x + 1   
    np.polysub(P1, P2)->[-5, 4, 4, 1, 1]
    代码:poly.py
    # -*- coding: utf-8 -*-
    from __future__ import unicode_literals
    import datetime as dt
    import numpy as np
    import matplotlib.pyplot as mp
    import matplotlib.dates as md
    
    
    def dmy2ymd(dmy):
        dmy = str(dmy, encoding='utf-8')
        date = dt.datetime.strptime(
            dmy, '%d-%m-%Y').date()
        ymd = date.strftime('%Y-%m-%d')
        return ymd
    
    
    dates, bhp_closing_prices = np.loadtxt(
        '../../data/bhp.csv', delimiter=',',
        usecols=(1, 6), unpack=True,
        dtype='M8[D], f8', converters={1: dmy2ymd})
    vale_closing_prices = np.loadtxt(
        '../../data/vale.csv', delimiter=',',
        usecols=(6), unpack=True)
    diff_closing_prices = bhp_closing_prices - \
        vale_closing_prices
    days = dates.astype(int)
    p = np.polyfit(days, diff_closing_prices, 4)
    poly_closing_prices = np.polyval(p, days)
    q = np.polyder(p)
    roots_x = np.roots(q)
    roots_y = np.polyval(p, roots_x)
    mp.figure('Polynomial Fitting', facecolor='lightgray')
    mp.title('Polynomial Fitting', fontsize=20)
    mp.xlabel('Date', fontsize=14)
    mp.ylabel('Difference Price', fontsize=14)
    ax = mp.gca()
    ax.xaxis.set_major_locator(md.WeekdayLocator(
        byweekday=md.MO))
    ax.xaxis.set_minor_locator(md.DayLocator())
    ax.xaxis.set_major_formatter(md.DateFormatter(
        '%d %b %Y'))
    mp.tick_params(labelsize=10)
    mp.grid(linestyle=':')
    dates = dates.astype(md.datetime.datetime)
    mp.plot(dates, poly_closing_prices, c='limegreen',
            linewidth=3, label='Polynomial Fitting')
    mp.scatter(dates, diff_closing_prices, c='dodgerblue',
               alpha=0.5, s=60, label='Difference Price')
    roots_x = roots_x.astype(int).astype(
        'M8[D]').astype(md.datetime.datetime)
    mp.scatter(roots_x, roots_y, marker='^', s=80,
               c='orangered', label='Peek', zorder=4)
    mp.legend()
    mp.gcf().autofmt_xdate()
    mp.show()

15.符号数组

  • np.sign(数组)->符号数组
    + -> 1
    - -> -1
    0 -> 0
    np.piecewise(源数组,条件序列,取值序列)->目标数组
    针对源数组中每一个元素,检测其是否符合条件序列中的每一个条件,符合哪个条件就用取值系列中与之对应的值,表示该元素,放到目标数组中返回.
    条件序列:[a<0,a==0,a>0]
    取值序列:[-1,0,1]
    代码:
    # -*- coding: utf-8 -*-
    from __future__ import unicode_literals
    import numpy as np
    a = np.array([70, 80, 60, 30, 40])
    print(a)
    b = a - 60
    print(b)
    c = np.sign(b)
    print(c)
    d = np.piecewise(a, [a < 60, a == 60, a > 60],
                     [-1, 0, 1])
    print(d)

    obv.py
     

    # -*- coding: utf-8 -*-
    from __future__ import unicode_literals
    import datetime as dt
    import numpy as np
    import matplotlib.pyplot as mp
    import matplotlib.dates as md
    
    
    def dmy2ymd(dmy):
        dmy = str(dmy, encoding='utf-8')
        date = dt.datetime.strptime(
            dmy, '%d-%m-%Y').date()
        ymd = date.strftime('%Y-%m-%d')
        return ymd
    
    
    dates, closing_prices, volumes = np.loadtxt(
        '../../data/bhp.csv', delimiter=',',
        usecols=(1, 6, 7), unpack=True,
        dtype='M8[D], f8, f8', converters={1: dmy2ymd})
    diff_closing_prices = np.diff(closing_prices)
    #sign_closing_prices = np.sign(diff_closing_prices)
    sign_closing_prices = np.piecewise(
        diff_closing_prices, [
            diff_closing_prices < 0,
            diff_closing_prices == 0,
            diff_closing_prices > 0], [-1, 0, 1])
    obvs = volumes[1:] * sign_closing_prices
    mp.figure('On-Balance Volume', facecolor='lightgray')
    mp.title('On-Balance Volume', fontsize=20)
    mp.xlabel('Date', fontsize=14)
    mp.ylabel('OBV', fontsize=14)
    ax = mp.gca()
    ax.xaxis.set_major_locator(md.WeekdayLocator(
        byweekday=md.MO))
    ax.xaxis.set_minor_locator(md.DayLocator())
    ax.xaxis.set_major_formatter(md.DateFormatter(
        '%d %b %Y'))
    mp.tick_params(labelsize=10)
    mp.grid(axis='y', linestyle=':')
    dates = dates[1:].astype(md.datetime.datetime)
    mp.bar(dates, obvs, 1.0, color='dodgerblue',
           edgecolor='white', label='OBV')
    mp.legend()
    mp.gcf().autofmt_xdate()
    mp.show()

16.矢量化

  • def 标量函数(标量参数1,标量参数2,...):
        ...
        return 标量返回值1,标量返回值2,...
    矢量参数1
    矢量参数2
    ...
    numpy.vectorize(标量函数)->矢量函数
    矢量函数(矢量参数1,矢量参数2,...)
      ->矢量返回值1,矢量返回值2
    代码:
    # -*- coding: utf-8 -*-
    from __future__ import unicode_literals
    import numpy as np
    
    
    def foo(x, y):
        return x + y, x - y, x * y
    
    
    x, y = 1, 4
    print(foo(x, y))
    X, Y = np.array([1, 2, 3]), np.array([4, 5, 6])
    bar = np.vectorize(foo)
    print(bar(X, Y))
    print(np.vectorize(foo)(X, Y))

    sim.py
     

    # -*- coding: utf-8 -*-
    from __future__ import unicode_literals
    import datetime as dt
    import numpy as np
    import matplotlib.pyplot as mp
    import matplotlib.dates as md
    
    
    def dmy2ymd(dmy):
        dmy = str(dmy, encoding='utf-8')
        date = dt.datetime.strptime(
            dmy, '%d-%m-%Y').date()
        ymd = date.strftime('%Y-%m-%d')
        return ymd
    
    
    dates, opening_prices, highest_prices, \
        lowest_prices, closing_prices = np.loadtxt(
            'bhp.csv', delimiter=',',
            usecols=(1, 3, 4, 5, 6), unpack=True,
            dtype='M8[D], f8, f8, f8, f8',
            converters={1: dmy2ymd})
    
    
    def profit(opening_price, highest_price,
               lowest_price, closing_price):
        buying_price = opening_price * 0.99
        if lowest_price <= buying_price <= highest_price:
            return (closing_price - buying_price) * \
                100 / buying_price
        return np.nan  # 无效值
    
    
    profits = np.vectorize(profit)(
        opening_prices, highest_prices, lowest_prices,
        closing_prices)
    nan = np.isnan(profits)
    dates, profits = dates[~nan], profits[~nan]
    gain_dates, gain_profits = dates[profits > 0], \
        profits[profits > 0]
    loss_dates, loss_profits = dates[profits < 0], \
        profits[profits < 0]
    mp.figure('Trading Simulation', facecolor='lightgray')
    mp.title('Trading Simulation', fontsize=20)
    mp.xlabel('Date', fontsize=14)
    mp.ylabel('Profit', fontsize=14)
    ax = mp.gca()
    ax.xaxis.set_major_locator(md.WeekdayLocator(
        byweekday=md.MO))
    ax.xaxis.set_minor_locator(md.DayLocator())
    ax.xaxis.set_major_formatter(md.DateFormatter(
        '%d %b %Y'))
    mp.tick_params(labelsize=10)
    mp.grid(linestyle=':')
    if dates.size > 0:
        dates = dates.astype(md.datetime.datetime)
        mp.plot(dates, profits, c='gray',
                label='Profit')
        mp.axhline(y=profits.mean(), linestyle='--',
                   color='gray')
    if gain_dates.size > 0:
        gain_dates = gain_dates.astype(md.datetime.datetime)
        mp.plot(gain_dates, gain_profits, 'o',
                c='orangered', label='Gain Profit')
        mp.axhline(y=gain_profits.mean(), linestyle='--',
                   color='orangered')
    if loss_dates.size > 0:
        loss_dates = loss_dates.astype(md.datetime.datetime)
        mp.plot(loss_dates, loss_profits, 'o',
                c='limegreen', label='Loss Profit')
        mp.axhline(y=loss_profits.mean(), linestyle='--',
                   color='limegreen')
    mp.legend()
    mp.gcf().autofmt_xdate()
    mp.show()

17.数据平滑

  • 降噪 + 拟合 + 特征识别
    卷积  多项式   数学方法
    y1 = f(x1)
    y1 = g(x1)
    f(x1) = g(x1)
    f(x1) - g(x1) = 0
    f(x) - g(x) = 0
    代码:
    # -*- coding: utf-8 -*-
    from __future__ import unicode_literals
    import datetime as dt
    import numpy as np
    import matplotlib.pyplot as mp
    import matplotlib.dates as md
    
    
    def dmy2ymd(dmy):
        dmy = str(dmy, encoding='utf-8')
        date = dt.datetime.strptime(
            dmy, '%d-%m-%Y').date()
        ymd = date.strftime('%Y-%m-%d')
        return ymd
    
    
    dates, bhp_closing_prices = np.loadtxt(
        '../../data/bhp.csv', delimiter=',',
        usecols=(1, 6), unpack=True,
        dtype='M8[D], f8', converters={1: dmy2ymd})
    vale_closing_prices = np.loadtxt(
        '../../data/vale.csv', delimiter=',',
        usecols=(6), unpack=True)
    bhp_returns = np.diff(
        bhp_closing_prices) / bhp_closing_prices[:-1]
    vale_returns = np.diff(
        vale_closing_prices) / vale_closing_prices[:-1]
    N = 8
    weights = np.hanning(N)
    weights /= weights.sum()
    bhp_smooth_returns = np.convolve(
        bhp_returns, weights, 'valid')
    vale_smooth_returns = np.convolve(
        vale_returns, weights, 'valid')
    days = dates[N - 1:-1].astype(int)
    degree = 3
    bhp_p = np.polyfit(days, bhp_smooth_returns, degree)
    bhp_fitted_returns = np.polyval(bhp_p, days)
    vale_p = np.polyfit(days, vale_smooth_returns, degree)
    vale_fitted_returns = np.polyval(vale_p, days)
    sub_p = np.polysub(bhp_p, vale_p)
    roots_x = np.roots(sub_p)
    roots_x = roots_x.compress(
        (days[0] <= roots_x) & (roots_x <= days[-1]))
    roots_y = np.polyval(bhp_p, roots_x)
    mp.figure('Smoothing Returns', facecolor='lightgray')
    mp.title('Smoothing Returns', fontsize=20)
    mp.xlabel('Date', fontsize=14)
    mp.ylabel('Returns', fontsize=14)
    ax = mp.gca()
    ax.xaxis.set_major_locator(md.WeekdayLocator(
        byweekday=md.MO))
    ax.xaxis.set_minor_locator(md.DayLocator())
    ax.xaxis.set_major_formatter(md.DateFormatter(
        '%d %b %Y'))
    mp.tick_params(labelsize=10)
    mp.grid(linestyle=':')
    dates = dates.astype(md.datetime.datetime)
    mp.plot(dates[:-1], bhp_returns, c='orangered',
            alpha=0.25, label='BHP')
    mp.plot(dates[:-1], vale_returns, c='dodgerblue',
            alpha=0.25, label='VALE')
    mp.plot(dates[N - 1:-1], bhp_smooth_returns,
            c='orangered', alpha=0.75,
            label='Smooth BHP')
    mp.plot(dates[N - 1:-1], vale_smooth_returns,
            c='dodgerblue', alpha=0.75,
            label='Smooth VALE')
    mp.plot(dates[N - 1:-1], bhp_fitted_returns,
            c='orangered', linewidth=3,
            label='Fitted BHP')
    mp.plot(dates[N - 1:-1], vale_fitted_returns,
            c='dodgerblue', linewidth=3,
            label='Fitted VALE')
    roots_x = roots_x.astype(int).astype(
        'M8[D]').astype(md.datetime.datetime)
    mp.scatter(roots_x, roots_y, s=100, marker='x',
               c='firebrick', lw=3, zorder=3)
    mp.legend()
    mp.gcf().autofmt_xdate()
    mp.show()


 

猜你喜欢

转载自blog.csdn.net/qq_42584444/article/details/83857118