Commit ea0dd615 authored by Ian Dennis Miller

basic keyword extraction

parent e80f0b8c
......@@ -10,6 +10,7 @@ clean:
requirements:
pip install -r requirements.txt
python -c 'import nltk; nltk.download("stopwords")'
python -c 'import nltk; nltk.download("punkt")'
test:
mkdir -p ./build
......
......@@ -61,6 +61,7 @@
</div>
<script>
var keywords = {{ keywords }};
{% include 'js/swiper.js' %}
{% include 'js/pager.js' %}
{% include 'js/search.js' %}
......
......@@ -3,12 +3,15 @@
import re
import yaml
import nltk
import json
import rdflib
import base64
import hashlib
import urllib.parse
from pydenticon import Generator
from jinja2 import Environment, PackageLoader, select_autoescape
from nltk.collocations import BigramCollocationFinder
from .__meta__ import __version__
......@@ -52,6 +55,7 @@ class VisFact:
def build_cards(self):
buf = ""
content = ""
quote_tmpl = self.env.get_template('card.html.j2')
......@@ -69,6 +73,8 @@ class VisFact:
if m:
fact_id = "claim-{0:0>2}-{1:0>4}".format(m.group(1), m.group(2))
content += quote + ' '
cite = "{authors}. ({year}). {title}.".format(
authors=authors,
year=year,
......@@ -121,7 +127,18 @@ class VisFact:
for fact_key in sorted(facts.keys()):
buf += quote_tmpl.render(**facts[fact_key])
return(buf, articles)
return(buf, articles, content)
def find_keywords(self, content, count=10, min_freq=2, min_length=3):
    """Return the most strongly associated two-word phrases in ``content``.

    The text is tokenized with NLTK, candidate bigrams are filtered, and
    the survivors are ranked by pointwise mutual information (PMI).

    Args:
        content: Plain text to analyse.
        count: Maximum number of phrases to return.
        min_freq: Bigrams occurring fewer than this many times are dropped.
        min_length: Bigrams containing a token shorter than this are dropped.

    Returns:
        A list of at most ``count`` strings, each the two words of a bigram
        joined by a single space, best-scoring first. Empty when ``content``
        is empty or whitespace-only.
    """
    # Nothing to tokenize; avoid touching the tokenizer and stopword corpus.
    if not content or not content.strip():
        return []

    bigram_measures = nltk.collocations.BigramAssocMeasures()
    tokens = nltk.word_tokenize(content)
    # A set gives O(1) membership tests; the filter below is evaluated for
    # every word of every candidate bigram.
    ignored_words = set(nltk.corpus.stopwords.words('english'))

    finder = BigramCollocationFinder.from_words(tokens)
    # Discard any bigram that contains a too-short token or a stopword.
    finder.apply_word_filter(
        lambda w: len(w) < min_length or w.lower() in ignored_words
    )
    finder.apply_freq_filter(min_freq)
    best = finder.nbest(bigram_measures.pmi, count)
    return [' '.join(pair) for pair in best]
def write_html(self):
if "rdf" in self.cfg.keys():
......@@ -132,13 +149,21 @@ class VisFact:
f.write(buf.encode())
print("visfact: wrote {0}".format(self.cfg["rdf"]))
result, articles = self.build_cards()
result, articles, content = self.build_cards()
# sort articles by value
sorted_articles = sorted(articles.items(), key=lambda x: x[1])
keywords = self.find_keywords(content)
print(keywords)
html_tmpl = self.env.get_template('main.html.j2')
with open(self.cfg["dest"], "wb") as f:
buf = html_tmpl.render(quotes=result, articles=sorted_articles, version=__version__)
buf = html_tmpl.render(
quotes=result,
keywords=json.dumps(keywords),
articles=sorted_articles,
version=__version__
)
buf = re.sub(r'file:///.+?#', '', buf)
f.write(buf.encode())
print("visfact: wrote {0}".format(self.cfg["dest"]))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment