# 《Python for Data Analysis》
# Location of the line-delimited JSON data file read below.
# NOTE(review): the directory is spelled 'cho2' (letter "o"); confirm it is
# not meant to be 'ch02' before relying on this path.
path = 'cho2/usagov_bitly_data2012-03-16-1331923249.txt'
import json
# Parse the file one line at a time; each non-blank line is decoded as a
# separate JSON document.  The `with` block guarantees the file handle is
# closed even if a line fails to parse.
with open(path) as f:
    # Blank lines (e.g. a trailing newline at EOF) would make json.loads
    # raise, so they are skipped.
    records = [json.loads(line) for line in f if line.strip()]
# Collect the 'tz' value of every record that has one; records without a
# 'tz' key are left out rather than raising KeyError.
time_zones = [rec['tz'] for rec in records if 'tz' in rec]
# dict (basic usage / 基础用法)
def get_counts(sequence):
    """Count how many times each item occurs in *sequence*.

    Args:
        sequence: Any iterable of hashable items.

    Returns:
        A plain ``dict`` mapping each distinct item to its number of
        occurrences.  An empty iterable yields an empty dict.
    """
    counts = {}
    for x in sequence:
        # dict.get supplies 0 for an item seen for the first time.
        counts[x] = counts.get(x, 0) + 1
    return counts
def top_counts(count_dict, n=10):
    """Return the *n* entries of *count_dict* with the largest counts.

    Args:
        count_dict: Mapping of key -> count.
        n: Maximum number of entries to return (default 10).

    Returns:
        A list of ``(count, key)`` tuples sorted in ascending order, so the
        largest count is last.  Returns an empty list when ``n <= 0``.
    """
    # A slice of [-0:] would return the whole list, and a negative n would
    # slice from the wrong end, so non-positive n is handled explicitly.
    if n <= 0:
        return []
    # Tuples sort by count first; ties fall back to comparing the key.
    value_key_pairs = sorted((count, tz) for tz, count in count_dict.items())
    return value_key_pairs[-n:]
# Using the Python standard library (使用标准Python库)
from collections import defaultdict


def get_counts2(sequence):
    """Count how many times each item occurs in *sequence*.

    Args:
        sequence: Any iterable of hashable items.

    Returns:
        A ``defaultdict(int)`` mapping each distinct item to its number of
        occurrences.  Looking up an item that never occurred returns 0 (and,
        as with any defaultdict, inserts that key).
    """
    # int() produces 0, so unseen keys start from zero automatically.
    counts = defaultdict(int)
    for x in sequence:
        counts[x] += 1
    return counts
from collections import Counter

# Counter tallies the occurrences of each time zone in a single call.
counts = Counter(time_zones)
# Ten most frequent time zones as (value, count) pairs, most common first.
# As a bare expression this is only displayed in an interactive session.
counts.most_common(10)
# Using pandas (使用pandas)
from pandas import DataFrame, Series
import pandas as pd
import numpy as np
# Build a DataFrame from the list of parsed records.
frame = DataFrame(records)
# First ten entries of the 'tz' column.  As a bare expression this is only
# displayed in an interactive session.
frame['tz'][:10]
# Tally each distinct 'tz' value; value_counts orders the result from most
# to least frequent.
tz_counts = frame['tz'].value_counts()
# Ten most frequent time zones (displayed only in an interactive session).
tz_counts[:10]