Skip to content

Methods

The machine learning algorithm Decision Tree is used to derive rules of transitions between successive weather patterns and to generate possible sequences.

Data preparation

Date Year Month GWL(t) GWL(t+1)
1961-01-01 1961 Jan WW WW
1961-01-02 1961 Jan WW WW
1961-01-03 1961 Jan WW WZ
1961-01-04 1961 Jan WZ WZ
1961-01-05 1961 Jan WZ WZ
1961-01-06 1961 Jan WZ WZ
1961-01-07 1961 Jan WZ WZ
1961-01-08 1961 Jan WZ WZ
1961-01-09 1961 Jan WZ WZ
1961-01-10 1961 Jan WZ BM
1990-12-22 1990 Dec WA WA
1990-12-23 1990 Dec WA WZ
1990-12-24 1990 Dec WZ WZ
1990-12-25 1990 Dec WZ WZ
1990-12-26 1990 Dec WZ WZ
1990-12-27 1990 Dec WZ WZ
1990-12-28 1990 Dec WZ WZ
1990-12-29 1990 Dec WZ WZ
1990-12-30 1990 Dec WZ SWZ
1990-12-31 1990 Dec SWZ SWZ

Tab.: Daily timeseries of European weather-types (GWL). The columns Month and GWL(t) are used to estimate the target in column GWL(t+1).


Decision Tree

Training

$$\mathbf{gwl[t]=\phi_{k,m}\cdot gwl[t-1]}$$

Testing

$$\mathbf{GWL_{1000}[\tau+1]=\phi_{k,m}\cdot GWL[\tau]}$$

Tree

Screenshot

Fig.: Illustration of the decision tree.

Evaluation

Screenshot

Fig.: Comparison of the annual frequency of weather-types (GWL): observed (top) and generated (bottom).


Code

Importing

#!/usr/bin/env python
# coding: utf-8

#get_ipython().run_line_magic('matplotlib', 'notebook')
## import dependencies
from sklearn import tree #For our Decision Tree
import pandas as pd # For our DataFrame
import pydotplus # To create our Decision Tree Graph
from IPython.display import Image  # To Display a image of our graph
import numpy as N
import pylab as P
import random
import datetime
from scipy import signal,stats

P.style.use('bmh')
P.rcParams['font.family'] = 'serif'

Generating

def gwlgen(ja,je,nt):

    print (ja,je,nj)

    file = '../csv/gwlneudatum.dat' 

    jj=N.genfromtxt(file,usecols=(2),skip_header=1,dtype="I")
    mm=N.genfromtxt(file,usecols=(1),skip_header=1,dtype="I")
    dd=N.genfromtxt(file,usecols=(0),skip_header=1,dtype="I")
    gw=N.genfromtxt(file,usecols=(3),skip_header=1,dtype='unicode')

    id1 = N.where((jj>=1961)&(jj<=2019))[0]
    jd1 = N.where((jj>=ja)&(jj<=je))[0]

    jj1 = jj[id1]
    mm1 = mm[id1]
    dd1 = dd[id1]
    gw1 = gw[id1]   

    go = N.array(list(set(gw)))
    gg = N.arange(len(go))

    print (go)

    file = '../csv/03987.dat' 

    jj=N.genfromtxt(file,usecols=(2),skip_header=1,dtype="i")
    mm=N.genfromtxt(file,usecols=(1),skip_header=1,dtype="i")
    tg=N.genfromtxt(file,usecols=(4),skip_header=1,dtype="f")
    rr=N.genfromtxt(file,usecols=(13),skip_header=1,dtype="f")    

    id2 = N.where((jj>=1961)&(jj<=2019))[0]
    jd2 = N.where((jj>=ja)&(jj<=je))[0]

    jj1 = jj[id2]
    mm1 = mm[id2]
    tg1 = tg[id2]            
    rr1 = rr[id2]

    ng = 30
    nm = 12

    tmit = N.zeros((nm,ng),float)
    nied = N.zeros((nm,ng),float)

    nk = 1000

    tmits = N.zeros((nm,ng,nk),float);tmits[:,:,:] = N.nan
    nieds = N.zeros((nm,ng,nk),float);nieds[:,:,:] = N.nan

    mo = N.arange(1,13,1)

    for ig in range(ng):
        for im in range(nm):

            ind = N.where((mm1==mo[im])&(gw1==go[ig]))[0]

            if(len(ind)==0): 

                tmit[im,ig] = tmit[im-1,ig]
                nied[im,ig] = nied[im-1,ig]           

            else:                               

                #id = random.choices(ind)[0] 

                #print (id)

                #tmit[im,ig] = tg[id] 
                #nied[im,ig] = rr[id] 

                tmit[im,ig] = N.nanmean(tg1[ind])
                nied[im,ig] = N.nanmean(rr1[ind]) 

                nl = len(ind)

                tmits[im,ig,:nl] = tg1[ind]
                nieds[im,ig,:nl] = rr1[ind]

    ##############################################            

    #f = open('obs_%i-%i.txt'%(1961,2018),'w')
    #f.write('%5s%5s%5s%5s%8s%8s\n'%('ja','mo','ta','gw','tg','rr'))

    #for d in range(len(id1)):

    #    ig = N.where(go==gw1[d])[0]
    #    im = N.where(mo==mm1[d])[0]

    #    tgx = tmit[im[0],ig[0]]
    #    rrx = nied[im[0],ig[0]]

    #    f.write('%5i%5i%5i%5s%8.1f%8.1f\n'%(jj1[d],mm1[d],dd1[d],gw1[d],tgx,rrx))

    #f.close()

    ##############################                

    mon = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']

    P.figure(figsize=(13,4))    

    P.subplot(121)

    PC = P.pcolor(tmit,cmap=P.get_cmap('bwr'),vmin=-5,vmax=20,edgecolors='k',linewidths=1)
    P.xticks(gg+0.5,go,rotation=45)
    P.yticks(mo-0.5,mon)
    P.title('Potsdam: Averaged Temperature per Month and GWL [mm/d]',fontsize=10,weight='bold')

    for m in mo:
        for g in gg:

            P.text(g+0.5,m-0.5,'%.1f'%tmit[m-1,g],fontsize=5,ha='center',va='center')

    P.subplot(122)

    P.pcolor(nied,cmap=P.get_cmap('YlGnBu'),vmin=1,vmax=8,edgecolors='k', linewidths=1)
    P.xticks(gg+0.5,go,rotation=45)    
    P.yticks(mo-0.5,mon)
    P.title('Potsdam: Averaged Precipitation per Month and GWL [mm/d]',fontsize=10,weight='bold')

    for m in mo:
        for g in gg:

            P.text(g+0.5,m-0.5,'%.1f'%nied[m-1,g],fontsize=5,ha='center',va='center')

    P.tight_layout()
    P.savefig('./img/matrix.png',dpi=240,transparent=False,bbox_inches='tight',pad_inches=0.0)

    ################################

    jj1 = jj[jd2]
    mm1 = mm[jd2]
    tg1 = tg[jd2]
    rr1 = rr[jd2]
    gw1 = gw[jd1]
    gw2 = gw[jd1+1]

    nd = len(jj1)

    mx1 = []

    for m in mm1:

        if(m== 1): dum = "Jan"
        if(m== 2): dum = "Feb"
        if(m== 3): dum = "Mar"
        if(m== 4): dum = "Apr"
        if(m== 5): dum = "Mai"
        if(m== 6): dum = "Jun"
        if(m== 7): dum = "Jul"
        if(m== 8): dum = "Aug"
        if(m== 9): dum = "Sep"
        if(m==10): dum = "Okt"
        if(m==11): dum = "Nov"
        if(m==12): dum = "Dez"    

        mx1.append(dum)     

    mx1 = N.array(mx1)

    # In[3]:

    df = pd.DataFrame()

    df['G0'] = gw1
    df['M0'] = mx1
    df['G1'] = gw2

    #print(df)

    df.to_html('table0.html')

    # In[4]:

    one_hot_data = pd.get_dummies(df[['G0','M0']])
#    one_hot_data = pd.get_dummies(df[['G0']])

    one_hot_data.to_html('table1.html')

    clf = tree.DecisionTreeClassifier(criterion="entropy")#"entropy")# Training the Decision Tree
    clf_train = clf.fit(one_hot_data,df[['G1']])

    # In[6]:

    keys = []

    for key in one_hot_data.columns:

        keys.append(key.split('_')[1])

    keys = N.array(keys)
    nk = len(keys)

    print (keys)

    #dot_data = tree.export_graphviz(clf_train,out_file=None,feature_names=keys,class_names=N.array(keys[0:30],str),rounded=True,filled=True,rotate=True)
    #graph = pydotplus.graph_from_dot_data(dot_data)

    #graph.write_png('./img/tree.png')
    #Image(graph.create_png())

    ###########################

    inp = N.array(one_hot_data)

    obs = []
    mod = []

    for d in range(nd): 

        #prediction = clf_train.predict([inp[d,:]])
        probabilit = clf_train.predict_proba([inp[d,:]])

        weights = probabilit

        nw = weights.shape[1]

        elements = one_hot_data.columns[0:nw]

        pred = random.choices(elements, weights[0,:],k=1)
        pred = pred[0].split('_')[1]

        obs.append(gw1[d])
        mod.append(pred)

        #print('%5s%5s%5s'%(gw1[d],gw2[d],pred))

    obs = N.array(obs)
    mod = N.array(mod) 

    ######################################

    k=datetime.date(2000+nj,1,1)-datetime.date(2000,1,1) 
    nt = k.days

    start = datetime.datetime(2000,1,1)

    init = N.zeros(nk,int)

    f = open('../csv/gen_%i-%i/gen_%i-%i.txt'%(ja,je,ja,je),'w')
    f.write('%5s%5s%5s%5s%8s%8s\n'%('ja','mo','ta','gw','tg','rr'))

    ind = N.where((keys=='WZ')|(keys=='Jan'))[0]
    init[ind]=1

    gen = []

    for t in range(nt): 

        date = start + datetime.timedelta(days=t)

        if(t>0): 

            m = date.month

            if(m==1): mm = "Jan"
            if(m==2): mm = "Feb"
            if(m==3): mm = "Mar"
            if(m==4): mm = "Apr"
            if(m==5): mm = "Mai"
            if(m==6): mm = "Jun"
            if(m==7): mm = "Jul"
            if(m==8): mm = "Aug"
            if(m==9): mm = "Sep"
            if(m==10): mm = "Okt"
            if(m==11): mm = "Nov"
            if(m==12): mm = "Dez" 

            init = N.zeros(nk,int)
            ind = N.where((keys==kk)|(keys==mm))[0]
            init[ind]=1

        probabilit = clf_train.predict_proba([init])
        weights = probabilit

        #nw = weights.shape[1]

        elements = keys[0:nw]

        pred = random.choices(elements, weights[0,:],k=1)

        kk = pred[0]

        jx = date.year
        mx = date.month
        dx = date.day

        gw = pred[0]

        ig = N.where(go==gw)[0]
        im = N.where(mo==mx)[0]

        tg = tmit[im[0],ig[0]]
        rr = nied[im[0],ig[0]]

        tgs = tmits[im[0],ig[0],:]
        rrs = nieds[im[0],ig[0],:]

        tgs = tgs[~N.isnan(tgs)]
        rrs = rrs[~N.isnan(rrs)]

        nums = list(N.arange(len(tgs)))        

        if(len(nums)>0):

            ran = random.choice(nums)

            tg = tgs[ran]
            rr = rrs[ran]

            f.write('%5i%5i%5i%5s%8.1f%8.1f\n'%(jx,mx,dx,gw,tg,rr))

        else:

            f.write('%5i%5i%5i%5s%8.1f%8.1f\n'%(jx,mx,dx,gw,tg,rr))

        #f.write('%5i%5i%5i%5s%8.1f%8.1f\n'%(jx,mx,dx,gw,tg,rr))

        gen.append(gw)

    f.close()

    gen = N.array(gen)

    # In[7]:

    fig = P.figure(figsize=(10,6))
    fig.suptitle('Weather-Type Sequence Generator using Decision Trees: Training Period (%i-%i) N=%i'%(ja,je,nj), y=1.03, fontsize=12,weight='bold')

    num_obs = []
    num_mod = []
    num_gen = []

    for key in keys[0:nk]:

        ind = N.where(obs==key)[0]
        num_obs.append(len(ind))

        ind = N.where(mod==key)[0]
        num_mod.append(len(ind))

        ind = N.where(gen==key)[0]
        num_gen.append(len(ind))    

    num_obs = N.array(num_obs)
    num_mod = N.array(num_mod)
    num_gen = N.array(num_gen)    

    ax = P.subplot(311)    

    P.bar(N.arange(nk),100.*num_obs/N.sum(num_obs),0.8,color='dodgerblue',ec='k',label='obs')
    P.xticks(N.arange(nk-12),keys[0:nk-12],rotation=45)
    P.xlim(-1,30)
    P.ylim(0,20)
    P.ylabel('Anteil [%]',weight='bold')
    P.legend(loc=2)
    ax.tick_params(direction='out')

    ax = P.subplot(312)    

    P.bar(N.arange(nk),100.*num_mod/N.sum(num_mod),0.8,color='dodgerblue',ec='k',label='mod')    
    P.xticks(N.arange(nk-12),keys[0:nk-12],rotation=45)
    P.xlim(-1,30)
    P.ylim(0,20)
    P.ylabel('Anteil [%]',weight='bold')
    P.legend(loc=2)
    ax.tick_params(direction='out')

    ax = P.subplot(313)    

    P.bar(N.arange(nk),100.*num_gen/N.sum(num_gen),0.8,color='dodgerblue',ec='k',label='gen')        
    P.xticks(N.arange(nk-12),keys[0:nk-12],rotation=45)
    P.xlim(-1,30)
    P.ylim(0,20)
    P.ylabel('Anteil [%]',weight='bold')
    P.legend(loc=2)
    ax.tick_params(direction='out')

    P.tight_layout()
    P.savefig('./img/gen_%i-%i.png'%(ja,je),dpi=240,transparent=False,bbox_inches='tight',pad_inches=0.0)

    return

Running

ja=1990;je=2019;nj=1000
gwlgen(ja,je,nj)

#ja=1990;je=2019;nj=1000
#gwlgen(ja,je,nj)