Create chart using python -


I have a list of contents. Each content has a list of the users who watched it. I want to create a chart like the one in the image below using Python.

I know that the radius of each circle is proportional to the number of users who watched the content, and the distance between circles is proportional to the number of joint users.

So I'm interested in any way of solving this problem (an algorithm or an existing package). Also, maybe someone knows what such charts are called (a cloud of links?).

enter image description here

Do you have any ideas on how to make it?

Thanks for the responses. I think it will be useful if I describe here how I solved the problem.

Here is a bit of code. First, I clustered the data using hierarchical or k-means clustering. Then I prepare a simple dictionary and convert it to D3-style JSON. The JSON is used by the HTML examples at http://bl.ocks.org/mbostock/4063530.

# coding: utf-8
"""Cluster contents by the users who watched them and export D3-style JSON.

The script reads, from a database, the most-watched contents together with
the ids of the users who watched each of them, builds a binary
content-by-user observation matrix, clusters the contents (k-means or
hierarchical/Ward) and dumps the clusters as a nested ``name``/``children``
dictionary to a JSON file.
"""
import argparse
import json
import logging
import sys

import numpy
import pylab
from numpy import array
from scipy.cluster import hierarchy
from sklearn.cluster import KMeans

from common import init_db_connection

logger = logging.getLogger("build2")

clustering_type_kmeans = "kmeans"
clustering_type_hierarchy = "hierarchy"

# Accepted --log-level values mapped to logging levels.  A plain
# ``getattr(logging, name)`` cannot be used: "info" resolves to the function
# ``logging.info`` and "warnings" is not a ``logging`` attribute at all.
_LOG_LEVELS = {
    "debug": logging.DEBUG,
    "info": logging.INFO,
    "warnings": logging.WARNING,
    "error": logging.ERROR,
}


def get_coord_names(cursor, limit):
    """Return the sorted distinct user ids of the ``limit`` most-watched contents.

    The returned list defines the column order of the observation matrix.
    """
    sql = """
        select distinct unnest(user_ids) as ivi_id
        from (
            select user_ids
            from content_watched_by_users as cw
                join content as c on c.id = cw.content_id
            order by array_length(user_ids, 1) desc
            limit %s
        ) as t
        order by ivi_id;
    """
    logger.info(cursor.mogrify(sql, (limit,)))
    cursor.execute(sql, (limit,))

    coord_names = [x[0] for x in cursor]
    return coord_names


def get_matrix_of_observations_and_objects(cursor, coords_name, limit):
    """Build the binary observation matrix and the list of clustered objects.

    Returns a tuple ``(matrix, objects)``: ``matrix`` is a numpy array with
    one row per content and one column per entry of ``coords_name`` (1 when
    that user watched the content, else 0); ``objects`` is a list of
    ``(title, number_of_users)`` tuples in the same row order.
    """
    sql = """
        select c.title, user_ids
        from content_watched_by_users as cw
            join content as c on c.id = cw.content_id
        order by array_length(user_ids, 1) desc
        limit %s"""
    logger.info(cursor.mogrify(sql, (limit,)))
    cursor.execute(sql, (limit,))

    logger.info(u"Начинаем получать матрицу наблюдений и массив объектов")

    # Map each user id to its column once instead of scanning the list with
    # ``list.index`` for every watched user; ``setdefault`` keeps the first
    # position, exactly like ``list.index`` would.
    column_of = {}
    for position, name in enumerate(coords_name):
        column_of.setdefault(name, position)

    matrix = []
    objects = []
    # NOTE: the first selected column is ``c.title``, not a numeric id.
    for content_id, user_ids in cursor:
        logger.info(u"Обрабатывается %s", content_id)
        objects.append((content_id, len(user_ids)))

        row = [0] * len(coords_name)
        for user_id in user_ids:
            try:
                row[column_of[user_id]] = 1
            except KeyError:
                logger.error(u"Что-то не так с user_ids %s", user_ids)

        matrix.append(row)
    logger.info(u"Матрица наблюдений и массив объектов получены")
    return array(matrix), objects


def _labels_to_d3_dict(labels, objects, name_cluster, first_label):
    """Group ``objects`` by cluster label into a D3 ``name``/``children`` tree.

    ``first_label`` is the smallest label the clustering algorithm can emit
    (1 for ``scipy`` flat clusters, 0 for ``sklearn`` labels).
    """
    root = {"name": "", "children": []}
    if len(labels) == 0:
        # Nothing was clustered: return an empty tree instead of failing
        # on ``max()`` of an empty sequence.
        return root

    cluster_count = int(max(labels)) - first_label + 1
    root["children"] = [{"name": "", "children": []} for _ in range(cluster_count)]

    for index, label in enumerate(labels):
        parent = root["children"][label - first_label]
        title, size = objects[index][0], objects[index][1]
        parent["children"].append({"name": title, "size": size})

        # Optionally name the cluster after the first object placed in it.
        if name_cluster and not parent["name"]:
            parent["name"] = title
    return root


def fcluster_to_d3_dict(fcluster, objects, name_cluster=False):
    """Convert 1-based flat cluster ids (``hierarchy.fcluster``) to a D3 tree."""
    return _labels_to_d3_dict(fcluster, objects, name_cluster, first_label=1)


def code_to_d3_dict(code, objects, name_cluster=False):
    """Convert 0-based cluster labels (``KMeans.labels_``) to a D3 tree."""
    return _labels_to_d3_dict(code, objects, name_cluster, first_label=0)


def save_to_json(result_dict, output_file="d3/flare.json"):
    """Serialize ``result_dict`` as JSON into ``output_file``."""
    logger.info(u"Перегоняем в json")
    # The context manager closes the file even if serialization fails.
    with open(output_file, "w") as f:
        json.dump(result_dict, f)
    logger.info(u"json сохранен по адресу: %s", output_file)


def hierarchy_clustering(matrix, objects, threshold, name_cluster):
    """Cluster rows of ``matrix`` with Ward linkage and cut at ``threshold``.

    Also saves the dendrogram of the linkage to ``temp.png``.
    Returns the clusters as a D3-style dictionary.
    """
    z = hierarchy.linkage(matrix, method='ward')
    fcluster = hierarchy.fcluster(z, threshold, 'distance')

    hierarchy.dendrogram(z)
    pylab.savefig("temp.png")

    logger.info(fcluster)
    result_dict = fcluster_to_d3_dict(fcluster, objects, name_cluster)
    return result_dict


def kmeans_clustering(matrix, objects, k, name_cluster=False):
    """Cluster rows of ``matrix`` into ``k`` groups with k-means.

    The matrix is scaled by its maximum and inverted (``1 - x``) before
    fitting.  Returns the clusters as a D3-style dictionary.
    """
    s = 1 - (matrix / numpy.max(matrix))
    db = KMeans(n_clusters=k).fit(s)
    logger.info(db.labels_)
    result_dict = code_to_d3_dict(db.labels_, objects, name_cluster)
    return result_dict


def main():
    """Parse the command line, load the data, cluster it and write the JSON."""
    parser = argparse.ArgumentParser(description=u'Скрипт для получения красивого графа')
    # database
    parser.add_argument('--db_host', default="localhost", type=str, dest="db_host",
                        help=u'Хост БД, по умолчанию: localhost')
    parser.add_argument('--db_port', default="5432", type=str, dest="db_port",
                        help=u'Порт БД, по умолчанию: 5432')
    parser.add_argument('--db_name', default="da_test", type=str, dest="db_name",
                        help=u'Имя БД, по умолчанию: da_test')
    parser.add_argument('--db_user', default="da", type=str, dest="db_user",
                        help=u'Пользователь БД, по умолчанию: da')
    # general
    # ``str.lower`` lets upper-case spellings such as INFO keep working.
    parser.add_argument("--log-level", default='info', type=str.lower, dest="log_level",
                        choices=['debug', 'info', 'warnings', 'error'], help=u"Уровень логирования")
    parser.add_argument('-l', '--limit', required=True, type=int, dest="limit",
                        help=u'Количество контента в выборке. '
                             u'Контент осортирован по количеству просмотревших его пользователей')
    parser.add_argument('-o', '--output', required=True, type=str, dest="output_file_path",
                        help=u'Куда сохранять json-результат')
    parser.add_argument('-n', '--name_cluster', action="store_true", dest="name_cluster",
                        help=u'Именовать кластеры по первому элементу в кластере')
    parser.add_argument("-c", "--clustering", default=clustering_type_kmeans, type=str, dest="clustering_type",
                        choices=[clustering_type_kmeans, clustering_type_hierarchy], help=u"Тип кластеризации")
    # k-means only
    parser.add_argument('-k', '--max_k', type=int, dest="max_k",
                        help=u'Максимальное число кластеров. Только для kmeans')
    # hierarchical clustering only
    parser.add_argument('-t', '--threshold', type=float, dest="threshold",
                        help=u'Граница разделения на плоские кластеры. Только для иерархической кластеризации')

    args = parser.parse_args()

    # Fail early with a usage message rather than passing None to the
    # clustering routines after the database has already been queried.
    if args.clustering_type == clustering_type_kmeans and args.max_k is None:
        parser.error("-k/--max_k is required for kmeans clustering")
    if args.clustering_type == clustering_type_hierarchy and args.threshold is None:
        parser.error("-t/--threshold is required for hierarchy clustering")

    logging.basicConfig(stream=sys.stdout, level=_LOG_LEVELS[args.log_level],
                        format="%(asctime)s :: %(message)s")

    connection = init_db_connection(args.db_host, args.db_port, args.db_user, args.db_name)
    try:
        cursor = connection.cursor()
        coords_name = get_coord_names(cursor, args.limit)
        matrix, objects = get_matrix_of_observations_and_objects(cursor, coords_name, args.limit)
    finally:
        # Release the connection even when a query fails.
        connection.close()

    if args.clustering_type == clustering_type_kmeans:
        result_dict = kmeans_clustering(matrix, objects, args.max_k, args.name_cluster)
    elif args.clustering_type == clustering_type_hierarchy:
        result_dict = hierarchy_clustering(matrix, objects, args.threshold, args.name_cluster)
    else:
        raise Exception(u"Неизвестный тип кластеризации")
    save_to_json(result_dict, args.output_file_path)


if __name__ == "__main__":
    main()

The result looks like this:

kmeans clustering


Comments

Popular posts from this blog

html5 - What is breaking my page when printing? -

c# - must be a non-abstract type with a public parameterless constructor in redis -

ajax - PHP/JSON Login script (Twitter style) not setting sessions -