Creating a chart using Python
I have a list of content items, and each item has a list of the users who watched it. I want to create a chart like the one in the image below using Python.
I know that the radius of each circle should be proportional to the number of users who watched that content, and the distance between circles should be proportional to the number of joint (shared) users.
So I'm interested in any way of solving this problem (an algorithm or an existing package). Also, does anyone know what such charts are called (a cloud of links?)?
Do you have any ideas on how to make it?
Thanks for the responses. I think it would be useful if I describe here how I solved the problem.
Here is a bit of code. First, I clustered the data using hierarchical/k-means clustering, then prepared a simple dictionary and converted it to d3-style JSON. The JSON is used with the HTML examples at http://bl.ocks.org/mbostock/4063530.
# coding: utf-8
"""Build a D3-style JSON hierarchy of content items clustered by shared viewers.

Content items are fetched from the database together with the arrays of users
who watched them, turned into a binary observation matrix (content x user),
and clustered either hierarchically (scipy) or with k-means (sklearn).  The
resulting flat clustering is dumped as the nested "flare" JSON format used by
the D3 examples at http://bl.ocks.org/mbostock/4063530.
"""
import argparse
import json
import logging
import sys

import numpy
import pylab
from numpy import array
from scipy.cluster import hierarchy
from sklearn.cluster import KMeans

from common import init_db_connection

logger = logging.getLogger("build2")

CLUSTERING_TYPE_KMEANS = "kmeans"
CLUSTERING_TYPE_HIERARCHY = "hierarchy"


def get_coord_names(cursor, limit):
    """Return the sorted, distinct user ids that form the matrix columns.

    Only the users of the ``limit`` most-watched content items are considered,
    so the column set matches the rows fetched later.
    """
    # NOTE(review): the inner ORDER BY references ivi_ids while the SELECT
    # returns user_ids -- confirm the column name against the schema.
    sql = """
        select distinct unnest(user_ids) as ivi_id
        from (
            select user_ids
            from content_watched_by_users cw
            join content c on c.id = cw.content_id
            order by array_length(ivi_ids, 1) desc
            limit %s
        ) t
        order by ivi_id;
    """
    logger.info(cursor.mogrify(sql, (limit,)))
    cursor.execute(sql, (limit,))
    return [row[0] for row in cursor]


def get_matrix_of_observations_and_objects(cursor, coords_name, limit):
    """Build the binary observation matrix and the parallel objects list.

    Returns ``(matrix, objects)`` where ``matrix[i][j]`` is 1 iff user
    ``coords_name[j]`` watched content item ``i``, and ``objects[i]`` is a
    ``(title, audience_size)`` pair for the same row.
    """
    sql = """
        select c.title, user_ids
        from content_watched_by_users cw
        join content c on c.id = cw.content_id
        order by array_length(user_ids, 1) desc
        limit %s"""
    logger.info(cursor.mogrify(sql, (limit,)))
    cursor.execute(sql, (limit,))
    logger.info(u"Начинаем получать матрицу наблюдений и массив объектов")
    matrix = []
    objects = []
    # Precompute column positions once: list.index() inside the inner loop
    # is O(columns) per lookup and quadratic overall.
    column_of = dict((user_id, j) for j, user_id in enumerate(coords_name))
    for title, user_ids in cursor:
        logger.info(u"Обрабатывается %s", title)
        objects.append((title, len(user_ids)))
        row = [0] * len(coords_name)
        for user_id in user_ids:
            try:
                row[column_of[user_id]] = 1
            except KeyError:
                # A watcher that is not among the selected columns.
                logger.error(u"Что-то не так с user_ids %s", user_ids)
        matrix.append(row)
    logger.info(u"Матрица наблюдений и массив объектов получены")
    return array(matrix), objects


def _clusters_to_d3_dict(assignments, objects, n_clusters, base, name_cluster):
    """Convert flat cluster assignments into the nested D3 "flare" dict.

    ``assignments[i]`` is the cluster id of ``objects[i]``; ``base`` is the
    smallest assignment value (1 for scipy fcluster, 0 for sklearn labels).
    """
    d = {"name": "",
         "children": [{"name": "", "children": []} for _ in range(n_clusters)]}
    for index, cluster_id in enumerate(assignments):
        parent = d["children"][cluster_id - base]
        title, size = objects[index]
        parent["children"].append({"name": title, "size": size})
        # Optionally label the cluster after the first member assigned to it.
        if name_cluster and not parent["name"]:
            parent["name"] = title
    return d


def fcluster_to_d3_dict(fcluster, objects, name_cluster=False):
    """D3 dict from 1-based scipy ``fcluster`` assignments."""
    return _clusters_to_d3_dict(fcluster, objects, max(fcluster), 1,
                                name_cluster)


def code_to_d3_dict(code, objects, name_cluster=False):
    """D3 dict from 0-based sklearn label assignments."""
    return _clusters_to_d3_dict(code, objects, max(code) + 1, 0, name_cluster)


def save_to_json(result_dict, output_file="d3/flare.json"):
    """Serialize ``result_dict`` to ``output_file`` as JSON."""
    logger.info(u"Перегоняем в json")
    # ``with`` guarantees the file is closed even if json.dump raises.
    with open(output_file, "w") as f:
        json.dump(result_dict, f)
    logger.info(u"json сохранен по адресу: %s", output_file)


def hierarchy_clustering(matrix, objects, threshold, name_cluster):
    """Ward hierarchical clustering; also saves a dendrogram to temp.png."""
    z = hierarchy.linkage(matrix, method='ward')
    fcluster = hierarchy.fcluster(z, threshold, 'distance')
    hierarchy.dendrogram(z)
    pylab.savefig("temp.png")
    logger.info(fcluster)
    return fcluster_to_d3_dict(fcluster, objects, name_cluster)


def kmeans_clustering(matrix, objects, k, name_cluster=False):
    """K-means clustering of the inverted, normalized observation matrix."""
    # NOTE(review): with a 0/1 matrix this maps 1 -> 0 and 0 -> 1, i.e. a
    # crude dissimilarity transform -- confirm this is the intended input.
    s = 1 - (matrix / numpy.max(matrix))
    model = KMeans(n_clusters=k).fit(s)
    logger.info(model.labels_)
    return code_to_d3_dict(model.labels_, objects, name_cluster)


def _build_arg_parser():
    """Define the command-line interface of the script."""
    parser = argparse.ArgumentParser(description=u'Скрипт для получения красивого графа')
    # БД (database connection)
    parser.add_argument('--db_host', default="localhost", type=str, dest="db_host",
                        help=u'Хост БД, по умолчанию: localhost')
    parser.add_argument('--db_port', default="5432", type=str, dest="db_port",
                        help=u'Порт БД, по умолчанию: 5432')
    parser.add_argument('--db_name', default="da_test", type=str, dest="db_name",
                        help=u'Имя БД, по умолчанию: da')
    parser.add_argument('--db_user', default="da", type=str, dest="db_user",
                        help=u'Пользователь БД, по умолчанию: da')
    # общее (general)
    # BUG FIX: the choice was "warnings", but the logging module defines only
    # WARNING, so that value would crash getattr() in main.
    parser.add_argument("--log-level", default='info', type=str, dest="log_level",
                        choices=['debug', 'info', 'warning', 'error'],
                        help=u"Уровень логирования")
    parser.add_argument('-l', '--limit', required=True, type=int, dest="limit",
                        help=u'Количество контента в выборке. '
                             u'Контент осортирован по количеству просмотревших его пользователей')
    parser.add_argument('-o', '--output', required=True, type=str,
                        dest="output_file_path",
                        help=u'Куда сохранять json-результат')
    parser.add_argument('-n', '--name_cluster', action="store_true",
                        dest="name_cluster",
                        help=u'Именовать кластеры по первому элементу в кластере')
    parser.add_argument("-c", "--clustering", default=CLUSTERING_TYPE_KMEANS,
                        type=str, dest="clustering_type",
                        choices=[CLUSTERING_TYPE_KMEANS, CLUSTERING_TYPE_HIERARCHY],
                        help=u"Тип кластеризации")
    # kmeans
    parser.add_argument('-k', '--max_k', type=int, dest="max_k",
                        help=u'Максимальное число кластеров. Только для kmeans')
    # иерархическая (hierarchical)
    parser.add_argument('-t', '--threshold', type=float, dest="threshold",
                        help=u'Граница разделения на плоские кластеры. Только для иерархической кластеризации')
    return parser


if __name__ == "__main__":
    args = _build_arg_parser().parse_args()
    # BUG FIX: getattr(logging, 'info') returns the logging.info *function*;
    # the numeric level constant needs the upper-cased name (logging.INFO).
    logging.basicConfig(stream=sys.stdout,
                        level=getattr(logging, args.log_level.upper()),
                        format="%(asctime)s :: %(message)s")
    connection = init_db_connection(args.db_host, args.db_port,
                                    args.db_user, args.db_name)
    try:
        cursor = connection.cursor()
        coords_name = get_coord_names(cursor, args.limit)
        matrix, objects = get_matrix_of_observations_and_objects(
            cursor, coords_name, args.limit)
    finally:
        # Close the connection even if a query fails.
        connection.close()
    if args.clustering_type == CLUSTERING_TYPE_KMEANS:
        result_dict = kmeans_clustering(matrix, objects, args.max_k,
                                        args.name_cluster)
    elif args.clustering_type == CLUSTERING_TYPE_HIERARCHY:
        result_dict = hierarchy_clustering(matrix, objects, args.threshold,
                                           args.name_cluster)
    else:
        raise Exception(u"Неизвестный тип кластеризации")
    save_to_json(result_dict, args.output_file_path)
The result looks like:
Comments
Post a Comment