# Data Wrangling with Python
#
# Fixing labels and headers using Python.
#
# Introduction
#
# This script uses Python to read in data files and clean them up.

# ----------| FIXING LABELS/HEADERS |----------

# Instructions:

# Fixing Labels/Headers – (page 155 – 156 Data Wrangling with Python).
# Create a new dictionary for each row to create a new array.

# Purpose:

# This portion of the code will read the UNICEF files and clean them up

from csv import DictReader

# Load the survey data and the header lookup as lists of dicts (one dict per
# CSV row, keyed by that file's first-row column names). The files are opened
# in `with` blocks so the handles are closed as soon as the rows are read.
with open('mn.csv', 'rt', encoding='utf-8') as data_file:
    data = DictReader(data_file)
    dataRows = list(data)

with open('mn_headers.csv', 'rt', encoding='utf-8') as header_file:
    header = DictReader(header_file)
    headerRows = list(header)

print(dataRows[:5])
print(headerRows[:5])

# Which header rows mention a given column key does not depend on the data
# row, so the matching 'Label' values are computed once per column key and
# reused, instead of rescanning every header row for every cell.
labels_by_column = {}

newRows = []

for data_dict in dataRows:
    new_row = {}
    for dkey, dval in data_dict.items():
        if dkey not in labels_by_column:
            # A header row matches when the column key appears in any of
            # its fields; matches are kept in header-file order.
            labels_by_column[dkey] = [
                header_dict.get('Label')
                for header_dict in headerRows
                if dkey in header_dict.values()
            ]
        # Re-key the value under each matching label; a column with no
        # matching header row is left out of the new row.
        for label in labels_by_column[dkey]:
            new_row[label] = dval
    newRows.append(new_row)

from csv import reader

# Re-read both files as plain lists of strings. Row 0 of mn.csv is treated as
# the header row of short column codes; the first field of each header-file
# row is treated as the short code it describes. `with` closes each file as
# soon as its rows have been read.
with open('mn.csv', 'rt', encoding='utf-8') as data_file:
    dataRows = list(reader(data_file))

data_codes = set(dataRows[0])

with open('mn_headers_updated.csv', 'rt', encoding='utf-8') as header_file:
    # `h and ...` skips blank lines, which csv.reader yields as empty lists.
    headerRows = [h for h in reader(header_file) if h and h[0] in data_codes]

print(len(headerRows))

all_short_headers = [h[0] for h in headerRows]
known_codes = set(all_short_headers)

# Positions of data columns that have no entry in the header file. Using
# enumerate (rather than list.index) keeps this correct when the same
# unmatched column name appears more than once.
skipIndex = [
    position
    for position, code in enumerate(dataRows[0])
    if code not in known_codes
]
skip_lookup = set(skipIndex)

# Data rows with the unmatched columns removed.
newData = [
    [value for position, value in enumerate(row) if position not in skip_lookup]
    for row in dataRows[1:]
]

# Short codes of the columns that were kept, in data-file order.
dataHeaders = [
    code
    for position, code in enumerate(dataRows[0])
    if position not in skip_lookup
]

# The header file is not guaranteed to list codes in the same order as the
# data columns, so look each kept column's header row up by its short code
# before pairing headers with values. The first header row wins if a code
# is listed more than once.
header_by_code = {}
for header_row in headerRows:
    header_by_code.setdefault(header_row[0], header_row)

alignedHeaderRows = [header_by_code[code] for code in dataHeaders]

# Each zipped row is a list of (header_row, value) pairs in data-file order.
zippedData = [list(zip(alignedHeaderRows, drow)) for drow in newData]

# Diagnostic: data-file column order next to header-file order, so any
# ordering difference between the two files is visible. Materialised as a
# list because printing a bare zip object only shows its repr in Python 3.
headerMatch = list(zip(dataHeaders, all_short_headers))

print(headerMatch)

# ----------| DATA FORMATS READABLE |----------

# Instructions:

# Using the same dataset as the above example (mn.csv and mn_headers_updated.csv),
# use the format method to make the output human readable.

# Show the first 20 entries of the first zipped row next to their positions.
first_row_pairs = zippedData[0]
for position, pair in enumerate(first_row_pairs[:20]):
    print((position, pair))


# ----------| DATE FORMATTING |----------

# Instructions:

# Format the dates to determine when the interview started and ended.

from datetime import datetime

# Work from the first zipped row; element [1] of each entry is the value
# that was zipped with the header.
first_interview = zippedData[0]

# Positions 8, 7 and 9 are used as month, day and year (see the strptime
# pattern below); positions 13 and 14 as the start hour and minute.
month_text = first_interview[8][1]
day_text = first_interview[7][1]
year_text = first_interview[9][1]

# Approach 1: build a date/time string with format() and parse it.
startString = '{}/{}/{} {}:{}'.format(
    month_text,
    day_text,
    year_text,
    first_interview[13][1],
    first_interview[14][1],
)
print(startString)

startTime = datetime.strptime(startString, '%m/%d/%Y %H:%M')
print(startTime)

# Approach 2: convert each field to int and pass the parts directly to
# datetime(year, month, day, hour, minute); positions 15 and 16 supply the
# end hour and minute.
end_parts = [
    int(text)
    for text in (
        year_text,
        month_text,
        day_text,
        first_interview[15][1],
        first_interview[16][1],
    )
]
endTime = datetime(*end_parts)
print(endTime)

# Subtracting two datetimes yields a timedelta.
duration = endTime - startTime
print(duration)