Analiza razstave Obrazi

In [67]:
# Importi
import csv
import time
from time import mktime
import pandas as pd
from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

Razčlenjevanje podatkov in nekaj ročne obdelave, da dobimo urejen format za analizo

In [15]:
# Raw export of the exhibition attendance sheet and the cleaned copy that
# the rest of the notebook reads.
# NOTE(review): these are absolute, machine-specific paths; consider making
# them relative to a configurable data directory.
path='/home/rok/muzej/analiza/obrazi1.csv'
path_new='/home/rok/muzej/analiza/obrazi_new.csv'

# Copy only the rows that carry both a date and a visitor total into a
# two-column file ('Datum', 'Skupaj').
# - The source is opened first, so a missing input no longer truncates an
#   existing cleaned file.
# - Python 2's csv module expects binary mode for both reading and writing;
#   text mode 'w' produces stray blank lines on platforms that translate
#   line endings.
with open(path, 'rb') as f, open(path_new, 'wb') as f_new:
    writer = csv.writer(f_new, delimiter=',')
    writer.writerow(['Datum', 'Skupaj'])
    reader = csv.DictReader(f)
    for row in reader:
        # DictReader fills fields missing from a short row with None, and
        # spreadsheet exports may contain whitespace-only cells; treat both
        # as empty so they cannot break the int()/strptime calls later on.
        datum = (row['Datum'] or '').strip()
        skupaj = (row['SKUPAJ'] or '').strip()
        if datum and skupaj:
            writer.writerow([datum, skupaj])


In [27]:
# path_new now points at the cleaned file used for the analysis.
# Read it back as a list of dicts, one per row, keyed by the header names.
with open(path_new, 'rb') as f:
    reader = csv.DictReader(f)
    datumi_obisk = list(reader)

# Sample of the data
print(datumi_obisk[:3])
[{'Skupaj': '10', 'Datum': '4/5/2011'}, {'Skupaj': '2', 'Datum': '4/7/2011'}, {'Skupaj': '9', 'Datum': '4/12/2011'}]
In [28]:
# Tabular view of the loaded rows; the values are still the strings that
# csv.DictReader produced (no numeric or date conversion happens here).
df = pd.DataFrame(datumi_obisk)
In [29]:
# Preview the first rows to check that both columns loaded as expected.
df.head()
Out[29]:
Datum Skupaj
0 4/5/2011 10
1 4/7/2011 2
2 4/12/2011 9
3 4/14/2011 5
4 5/15/2011 3

5 rows × 2 columns

In [49]:
def get_day(d):
    """Return the weekday of a parsed date as an integer.

    Args:
        d: A ``time.struct_time`` (e.g. the result of ``time.strptime``) or
            any sequence whose first three items are year, month and day.

    Returns:
        int: 0 for Monday through 6 for Sunday.

    Raises:
        ValueError: If year, month and day do not form a valid calendar date.
    """
    # Build the date directly from the calendar fields instead of going
    # through mktime()/fromtimestamp(): that round trip depends on the local
    # timezone, can land on a neighbouring day when local midnight falls in a
    # DST gap, and can overflow on platforms that reject pre-epoch timestamps.
    year, month, day = d[:3]
    return datetime(year, month, day).weekday()

# Weekdays are encoded as numbers from 0 (Monday) to 6 (Sunday).
# Each source row becomes a {'Dan': weekday, 'Skupaj': visitor count} record;
# dates are parsed as month/day/year.
dnevi_obisk = [
    {
        'Dan': get_day(time.strptime(record['Datum'], "%m/%d/%Y")),
        'Skupaj': int(record['Skupaj']),
    }
    for record in datumi_obisk
]

print(dnevi_obisk[:3])
[{'Dan': 1, 'Skupaj': 10}, {'Dan': 3, 'Skupaj': 2}, {'Dan': 1, 'Skupaj': 9}]
In [50]:
# One row per attendance record: weekday number ('Dan') and the integer
# visitor count ('Skupaj').
dnevi_df = pd.DataFrame(dnevi_obisk)
# Preview the first rows.
dnevi_df.head()
Out[50]:
Dan Skupaj
0 1 10
1 3 2
2 1 9
3 3 5
4 6 3

5 rows × 2 columns

In [62]:
# Visitor counts grouped by weekday number (0 = Monday ... 6 = Sunday).
grps = dnevi_df.groupby(by=['Dan'])['Skupaj']
In [64]:
# Per-weekday summary of the visitor counts: totals, number of records,
# central tendency, range and spread.
# NOTE(review): the printed table shows a maximum of 552 for day 4 against a
# median of 8, which inflates that day's mean/std/var -- confirm whether it
# is a genuine event or a data-entry error.
desc = grps.agg([
    'sum', 'count',
    'mean', 'median',
    'min', 'max',
    'std', 'var',
])
print(desc)
      sum  count       mean  median  min  max        std          var
Dan
0      55      5  11.000000      11    0   24   8.660254    75.000000
1    1100     93  11.827957       9    0   50   9.749720    95.057036
2     784     87   9.011494       8    0   35   7.946762    63.151029
3     900     96   9.375000       8    0   47   7.435689    55.289474
4    1457     87  16.747126       8    0  552  58.713973  3447.330660
5    1135     87  13.045977      11    0  144  16.639060   276.858327
6    1150     81  14.197531      13    0   51   9.781896    95.685494

[7 rows x 8 columns]