# UsersKeyConnectorsSample.py -- from the fcnatra/PythonMLKeyConnectors repository (master branch).

from __future__ import division
from collections import Counter
from collections import defaultdict

# User records: each user's id is the position of its name in the sequence below.
users = [
    {"id": position, "name": name}
    for position, name in enumerate((
        "Hero", "Dunn", "Sue", "Chi", "Thor",
        "Clive", "Hicks", "Devin", "Kate", "Klein",
    ))
]

# Show the raw records before any friendship data is attached to them.
print(users)

# Friendship edges, each given as a pair of user ids.
friendships = [
    (0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (3, 4),
    (4, 5), (5, 6), (5, 7), (6, 8), (7, 8), (8, 9),
]

# Every user starts with an empty friend list ...
for person in users:
    person["friends"] = []

# ... and each edge is then recorded on both of its endpoints, so the
# "friends" lists hold references to the other user dicts (not copies).
for left_id, right_id in friendships:
    left, right = users[left_id], users[right_id]
    left["friends"].append(right)
    right["friends"].append(left)

# (user_id, interest) pairs, one pair per interest a user lists.
# The groups below are ordered by user id, so enumerate() supplies the id.
interests = [
    (user_id, topic)
    for user_id, topics in enumerate((
        ("Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"),
        ("NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"),
        ("Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"),
        ("R", "Python", "statistics", "regression", "probability"),
        ("machine learning", "regression", "decision trees", "libsvm"),
        ("Python", "R", "Java", "C++", "Haskell", "programming languages"),
        ("statistics", "probability", "mathematics", "theory"),
        ("machine learning", "scikit-learn", "Mahout", "neural networks"),
        ("neural networks", "deep learning", "Big Data", "artificial intelligence"),
        ("Hadoop", "Java", "MapReduce", "Big Data"),
    ))
    for topic in topics
]

# (salary, tenure) pairs, one observation per line.
salaries_and_tenures = [
    (83000, 8.7),
    (88000, 8.1),
    (48000, 0.7),
    (76000, 6),
    (69000, 6.5),
    (76000, 7.5),
    (60000, 2.5),
    (83000, 10),
    (48000, 1.9),
    (63000, 4.2),
]

# Print one line per user: "[id] name's friends:" followed by the friends' names.
for user in users:
    friends = [friend['name'] for friend in user['friends']]
    label = '[' + str( user['id'] ) + '] ' + user['name']
    # '%' binds tighter than '+', so only the last literal is formatted.
    print( label + '\'s friends:   \t(%s)' % ', '.join(map(str, friends)))

def number_of_friends(user):
    """Return the number of entries in the user's ``friends`` list."""
    friend_list = user['friends']
    return len(friend_list)

# Overall connectivity: total friend-list entries and the mean per user.
number_of_users = len(users)
total_connections = sum(map(number_of_friends, users))
average_connections = total_connections / number_of_users

# One (id, friend count, name) tuple per user, most-connected first.
# The sort is stable, so users with equal counts keep their original order.
number_of_friends_byId = [
    (user['id'], number_of_friends(user), user['name'])
    for user in users
]
number_of_friends_byId.sort(key=lambda entry: entry[1], reverse=True)

print( 'Number of connections per user sorted by connections (desc): ' + str(number_of_friends_byId))
print( 'Average of connections: ' + str(average_connections) )

def friends_of_friend_ids(user):
    """Count the friends-of-friends of ``user``, keyed by their id.

    Walks every friend of ``user`` and every friend of that friend, skipping
    candidates that are ``user`` itself or already one of ``user``'s friends.
    Each remaining candidate's id is counted once per friend it was reached
    through, and the tallies are returned as a ``Counter``.
    """
    mutual_counts = Counter()
    for friend in user["friends"]:
        for candidate in friend["friends"]:
            if not not_the_same_user(user, candidate):
                continue  # the candidate is the user themselves
            if not not_friends(user, candidate):
                continue  # the candidate is already a direct friend
            mutual_counts[candidate["id"]] += 1
    return mutual_counts

def not_the_same_user(user, other_user):
    """Return True when the two user records carry different ``id`` values."""
    first_id = user["id"]
    second_id = other_user["id"]
    return first_id != second_id

def not_friends(user, other_user):
    """Return True when nobody in ``user["friends"]`` shares ``other_user``'s id.

    An empty friend list therefore yields True.
    """
    for friend in user["friends"]:
        if not not_the_same_user(friend, other_user):
            # Found other_user among the friends.
            return False
    return True

# Report the friend-of-a-friend tallies for the user at index 3.
print(
    'Friends of friends for ' + users[3]['name'] + ': '
    + str( friends_of_friend_ids( users[3] ) )
)

def data_scientists_who_like(target_interest):
    """Return the user ids paired with ``target_interest`` in ``interests``.

    Scans the module-level ``interests`` pairs in order and compares each
    interest to ``target_interest`` with ``==`` (an exact, case-sensitive
    match for strings). An id appears once per matching pair.
    """
    matching_ids = []
    for user_id, user_interest in interests:
        if user_interest == target_interest:
            matching_ids.append(user_id)
    return matching_ids

# Two lookup tables, filled in a single pass over the (user_id, interest) pairs:
#   user_ids_by_interest: interest -> ids of the users who list it
#   interests_by_userId:  user id  -> interests listed by that user
user_ids_by_interest = defaultdict(list)
interests_by_userId = defaultdict(list)
for user_id, interest in interests:
    user_ids_by_interest[interest].append(user_id)
    interests_by_userId[user_id].append(interest)

def who_has_most_common_interest_with(user):
    """Count, per other user id, how many interests are shared with ``user``.

    For each interest listed under ``user['id']`` in ``interests_by_userId``,
    every id in ``user_ids_by_interest`` for that interest other than the
    user's own is tallied. Returns the tallies as a ``Counter``.
    """
    own_id = user['id']
    shared_counts = Counter()
    for interest in interests_by_userId[own_id]:
        for interested_userId in user_ids_by_interest[interest]:
            if interested_userId != own_id:
                shared_counts[interested_userId] += 1
    return shared_counts

# Ids of every user who lists 'machine learning' as an interest.
users_interested_inML_byId = data_scientists_who_like( 'machine learning' )
print(
    'Data scientists who like \'machine learning\': '
    + str( users_interested_inML_byId )
)

# Take the first of those users and tally the interests other users share with them.
userId = users_interested_inML_byId[0]
others_with_common_interests_with_userId = who_has_most_common_interest_with( users[userId] )
print(
    'Data scientist with interest in ML in common with ' + users[userId]['name'] + ': '
    + str( others_with_common_interests_with_userId )
)

print( '\nMost common interests: ')
# Tally every lower-cased, whitespace-separated word across all interests.
words_and_counts = Counter(
    word
    for _user_id, interest in interests
    for word in interest.lower().split()
)
# most_common() already yields (word, count) pairs from highest to lowest
# count, with ties in first-seen order, so no further sorting is needed.
for word, count in words_and_counts.most_common():
    # Only words that occur more than once are reported.
    if count > 1:
        print( word, count )

print( '\n' )
# Average salary for each distinct tenure value.
# salary_by_tenure maps a tenure (years) to the list of salaries recorded for it.
salary_by_tenure = defaultdict(list)
for salary, tenure in salaries_and_tenures:
    salary_by_tenure[tenure].append(salary)

# Reduce each salary list to its mean.
average_salary_by_tenure = {}
for years, pay_list in salary_by_tenure.items():
    average_salary_by_tenure[years] = sum(pay_list) / len(pay_list)

print( average_salary_by_tenure )

# Bucket the tenures, then compute the average salary for each bucket.
def tenure_bucket(tenure):
    """Map a tenure to one of three bucket labels.

    Returns "less than two" when ``tenure < 2``, "between two and five" when
    ``tenure < 5``, and "more than five" otherwise (so exactly 5 falls in the
    last bucket).
    """
    if tenure < 2:
        return "less than two"
    if tenure < 5:
        return "between two and five"
    return "more than five"

# Group the salaries under the label tenure_bucket() assigns to each tenure.
salary_by_tenure_bucket = defaultdict(list)
for salary, tenure in salaries_and_tenures:
    salary_by_tenure_bucket[tenure_bucket(tenure)].append(salary)

# Mean salary per bucket label. The loop variable is deliberately not called
# tenure_bucket, so the function of that name is left untouched.
average_salary_by_bucket = {}
for bucket, salaries in salary_by_tenure_bucket.items():
    average_salary_by_bucket[bucket] = sum(salaries) / len(salaries)

print ( average_salary_by_bucket )