Commit 212c5a69 authored by Ian Dennis Miller's avatar Ian Dennis Miller

split up counting queries, write graph straight to disk without buffer,...

split up counting queries, write graph straight to disk without buffer, summary is transformation, add to readme, make portable file with degree 2
parent a368d082
Pipeline #264 passed with stage
in 14 seconds
......@@ -5,3 +5,4 @@
/dist/
/*.egg-info/
downloader.log
downloads
USERNAME=JohnsonGregor18
USERNAME=zane88782178
# USERNAME=JohnsonGregor18
# USERNAME=iandennismiller
# Fetch the data by-products for $(USERNAME) from the API on localhost:5000
# into a freshly created ./download directory (start the API with
# `make server` first, per the README).
#
# Declared phony because this target names a command, not a file: without it,
# any file or directory called `downloads` would make this rule a no-op.
# NOTE(review): .gitignore lists `downloads`, but the recipe writes to
# `download` -- confirm which directory name is intended.
.PHONY: downloads
downloads:
	rm -rf download
	mkdir download
	cd download && wget localhost:5000/summary.json
	cd download && wget localhost:5000/community-tags.json
	cd download && wget localhost:5000/descriptions.json
	cd download && wget localhost:5000/description/$(USERNAME)-description.json
	cd download && wget localhost:5000/name/$(USERNAME).json
	cd download && wget localhost:5000/net/social-network-anonymous.graphml
	cd download && wget localhost:5000/net/social-network.graphml
	cd download && wget localhost:5000/followers/$(USERNAME)-followers.json
	cd download && wget localhost:5000/leaders.json
	cd download && wget localhost:5000/leaders/$(USERNAME)-leaders.json
# Run bin/run_query.py, passing $(USERNAME) as its only argument.
query:
bin/run_query.py $(USERNAME)
......@@ -11,6 +26,9 @@ db:
rm -rf ./data/db/$(USERNAME).db/
bin/build_db.py $(USERNAME)
# Run bin/convert_degree.py for $(USERNAME): it loads the snapshot at
# degree 1, converts it to degree 2 and saves the result.
portable:
bin/convert_degree.py $(USERNAME)
# Run bin/run_snapshot.py for $(USERNAME). Per the README, this downloads the
# account and all accounts with an edge to it.
snapshot:
bin/run_snapshot.py $(USERNAME)
......@@ -26,4 +44,4 @@ clean-locks:
# Install the Python packages listed in requirements.txt using pip.
requirements:
pip install -r requirements.txt
.PHONY: query install db snapshot server test clean-locks requirements
.PHONY: query install db portable snapshot server test clean-locks requirements
# twitter-save
## Overview
Set `$USERNAME` in the Makefile.
make snapshot
make portable
make db
make server
make downloads
## Usage
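Set the `USERNAME` variable in the Makefile, or override it per invocation on the command line (for example, `make snapshot USERNAME=some_account`).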
### Snapshot
Download the account and all accounts with an edge to it.
make snapshot
### Portable file
Combines all of the individual accounts (`degree = 1`) into a single Turtle file with `degree = 2`.
This new file is a portable representation of the account snapshot.
make portable
### Create database
Load a portable, `degree = 2` graph into a persistent database.
make db
### Launch API server
Start the API server, which will use the on-disk database.
make server
### Download information
Query the API to download data by-products from the snapshot.
make downloads
## Installation on Mac OS
Install Berkeley DB 4.8.
......
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
sys.path.insert(0, ".")
from saver.snapshot import Snapshot
def main():
    """
    Convert one account's degree-1 snapshot into a degree-2 snapshot.

    The screen name is taken from the first command-line argument. The
    snapshot is loaded at degree 1, converted to degree 2 and saved.

    Exits with status 1, printing the usage line to stderr, when no screen
    name is supplied.
    """
    if len(sys.argv) < 2:
        # A missing argument is a failure: report it on stderr and exit
        # non-zero so callers (e.g. `make portable`) stop instead of carrying
        # on as if the conversion had succeeded.
        print("Usage: convert_degree.py [screen_name]", file=sys.stderr)
        sys.exit(1)

    snapshot = Snapshot(sys.argv[1])
    snapshot.load(degree=1)
    snapshot.convert_degree(2)
    snapshot.save()
if __name__ == "__main__":
main()
......@@ -247,4 +247,4 @@ def summary():
['This is an account description', 'This is an account description']
"""
return jsonify(snapshot.query.summary())
return jsonify(snapshot.transformation.produce_summary())
......@@ -57,37 +57,32 @@ class Query:
results = self.snapshot.g.query(sparql)
return([edge for edge in results])
def summary(self):
# number of accounts
# number of accounts
def num_accounts(self):
    """
    Return the number of accounts in the snapshot graph.

    Renders the 'num_accounts.sparql' template, runs it against
    self.snapshot.g, and returns the first column of the first result row
    converted to int.
    """
    sparql = self.env.get_template('num_accounts.sparql').render()
    results = self.snapshot.g.query(sparql)
    # Read the single count row exactly once.
    first_row = next(iter(results))
    return int(first_row[0])
# number of statuses
# number of statuses
def num_statuses(self):
    """
    Return the number of statuses in the snapshot graph.

    Renders the 'num_statuses.sparql' template, runs it against
    self.snapshot.g, and returns the first column of the first result row
    converted to int.
    """
    sparql = self.env.get_template('num_statuses.sparql').render()
    results = self.snapshot.g.query(sparql)
    # Read the single count row exactly once.
    first_row = next(iter(results))
    return int(first_row[0])
# number of follows
# number of follows
def num_follows(self):
    """
    Return the number of follows in the snapshot graph.

    Renders the 'num_social_network_follows.sparql' template, runs it against
    self.snapshot.g, and returns the first column of the first result row
    converted to int.
    """
    sparql = self.env.get_template('num_social_network_follows.sparql').render()
    results = self.snapshot.g.query(sparql)
    # Read the single count row exactly once.
    first_row = next(iter(results))
    return int(first_row[0])
# number of leads
# number of leads
def num_leads(self):
    """
    Return the number of leads in the snapshot graph.

    Renders the 'num_social_network_leads.sparql' template, runs it against
    self.snapshot.g, and returns the first column of the first result row
    converted to int.
    """
    sparql = self.env.get_template('num_social_network_leads.sparql').render()
    results = self.snapshot.g.query(sparql)
    # Read the single count row exactly once.
    first_row = next(iter(results))
    return int(first_row[0])
# number of favorites
# number of favorites
def num_favorites(self):
    """
    Return the number of favorites in the snapshot graph.

    Renders the 'num_favorites.sparql' template, runs it against
    self.snapshot.g, and returns the first column of the first result row
    converted to int.

    Only the count is returned here, matching the other num_* methods;
    Transformation.produce_summary combines the counts into a dictionary.
    """
    sparql = self.env.get_template('num_favorites.sparql').render()
    results = self.snapshot.g.query(sparql)
    # Read the single count row exactly once.
    first_row = next(iter(results))
    return int(first_row[0])
......@@ -251,13 +251,11 @@ class Snapshot:
self.ttl_filename = "data/d{0}/{1}.ttl".format(self.degree, self.screen_name.lower())
def save(self):
    """
    Serialize the graph as Turtle and write it to self.ttl_filename.

    The serialized output is handed directly to the file write, with no
    intermediate buffer variable, and is written exactly once.
    """
    print("Preparing to save")
    print("Writing to {0}".format(self.ttl_filename))
    # Called once, before the file is opened.
    # NOTE(review): presumably creates the output directories -- confirm.
    self.make_paths()
    with open(self.ttl_filename, 'wb') as f:
        f.write(self.g.serialize(format='turtle'))
    print("OK")
def load(self, degree=1):
self.degree = degree
......
......@@ -142,6 +142,22 @@ class Transformation:
tags = find_tags(phrases)
return(tags)
def produce_summary(self):
    """
    Collect the snapshot's counts into a single dictionary.

    Calls the num_* methods on self.snapshot.query and returns a dict with
    the keys num_accounts, num_statuses, num_follows, num_favorites,
    num_leads and num_edges, where num_edges is leads plus follows.
    """
    query = self.snapshot.query

    # Run the counting queries in a fixed order.
    accounts = query.num_accounts()
    statuses = query.num_statuses()
    follows = query.num_follows()
    favorites = query.num_favorites()
    leads = query.num_leads()

    summary = {}
    summary['num_accounts'] = accounts
    summary['num_statuses'] = statuses
    summary['num_follows'] = follows
    summary['num_favorites'] = favorites
    summary['num_leads'] = leads
    summary['num_edges'] = leads + follows
    return summary
def produce_status_list(self, screen_name=None):
"""
Given an RDF triples graph,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment