NoSQL 을 사용한 카카오톡 메시지 분석

개요

카카오톡 대화내용을 분석하여 순위를 매김

통계는 MongoDB 를 사용

목표

날짜별 대화 개수 순위
시간별 대화 개수 순위
사람별 대화 개수 순위
이모티콘 개수 순위
사진 개수 순위
단어 빈도 순위

개발

환경

Linux 서버 : MongoDB
Windows 클라이언트 : 파이썬

서버측 MongoDB 설치

docker 사용

$ docker run --name mongo -p 27017:27017 -d mongo

외부 접속을 허용하기 위해 port 연결

클라이언트측 Python 설치

konlpy 패키지 설치

단어 파싱을 위해 konlpy 패키지 설치

konlpy 는 JDK 가 필요하며, 윈도우용으로 빌드된 OpenJDK 가 64bit 이므로 Python 도 64bit 로 맞춤

bit 가 맞지 않으면 jvm.dll 로드에 실패함

OpenJDK 64 bit 설치
python 64 bit 설치

pip 를 통해 konlpy 를 설치하면 알맞은 bit 의 패키지가 설치됨

의존성에 의해 JPype1 도 같이 설치됨

$ python -m pip install --upgrade pip
$ python -m pip install konlpy

pymongo 패키지 설치

MongoDB 를 사용하기 위해 pymongo 패키지 설치

$ python -m pip install pymongo

구현

카카오톡 앱에서 내보내기된 대화 내역을 Python 으로 파싱하여 DB 에 입력
DB 에서 적절한 쿼리를 통해 통계 계산

코드

import os
from datetime import datetime
from konlpy.tag import Okt
import pymongo

# Shared konlpy Okt tagger instance; make_bulk_data() calls okt.pos() on it
# for every message, so it is created once here instead of per call.
okt = Okt()
# Placeholder for the MongoDB collection handle; it is assigned further down
# the script from the return value of conn_db().
db_conn = None


def conn_db(address, database, collection):
    """Open a MongoDB connection and return the requested collection, emptied.

    Args:
        address: Connection string or host passed to ``pymongo.MongoClient``.
        database: Name of the database to select.
        collection: Name of the collection inside that database.

    Returns:
        The selected collection object. Every document that was already
        stored in it is deleted before it is returned.
    """
    client = pymongo.MongoClient(address)
    target = client[database][collection]
    # An empty filter matches everything: each run starts from a clean slate.
    target.delete_many({})
    return target


def get_datetime(msg):
    """Parse a KakaoTalk timestamp such as ``2017년 2월 2일 오후 7:38``.

    The text is split right after the first '일' into a date part and a time
    part. The time part must start with the meridiem marker '오전' (AM) or
    '오후' (PM) followed by a 12-hour ``H:MM`` clock.

    The meridiem is applied by hand instead of being translated to "AM"/"PM"
    and parsed with ``%p``: ``%p`` matches the *locale's* AM/PM strings, so
    the parse would fail whenever a non-English ``LC_TIME`` locale is active.

    Args:
        msg: Candidate timestamp text; surrounding whitespace is ignored.

    Returns:
        A naive ``datetime`` on success, or ``None`` when *msg* is not a
        timestamp in the expected format.
    """
    text = msg.strip()
    day_end = text.find('일')
    if day_end < 0:
        return None

    date_part = text[:day_end + 1].strip()
    time_part = text[day_end + 1:].strip()

    meridiem = time_part[:2]
    if meridiem not in ('오전', '오후'):
        return None

    try:
        day = datetime.strptime(date_part, '%Y년 %m월 %d일')
        # %I restricts the hour to 1-12, exactly like the 12-hour export format.
        clock = datetime.strptime(time_part[2:].strip(), '%I:%M')
    except ValueError:
        return None

    # "% 12" maps 12 o'clock to 0 so that 오전 12:xx -> 00:xx and
    # 오후 12:xx -> 12:xx after the PM offset is added.
    hour = clock.hour % 12
    if meridiem == '오후':
        hour += 12
    return day.replace(hour=hour, minute=clock.minute)


def make_bulk_data(bulk_data, date, name, msg):
    """Build one message document and append it to *bulk_data*.

    The document always carries ``date``, ``name`` and ``msg``. In addition,
    *msg* is run through ``okt.pos`` and every resulting token is collected
    in a list stored under its tag name, so the tag names become extra
    fields of the document. Tokens tagged ``Foreign`` are left out.

    Args:
        bulk_data: List of pending documents; modified in place.
        date: Timestamp of the message.
        name: Sender of the message.
        msg: Message text.
    """
    document = {'date': date, 'name': name, 'msg': msg}

    for token, tag in okt.pos(msg):
        if tag == 'Foreign':
            continue
        if tag in document:
            document[tag].append(token)
        else:
            # First token seen for this tag: start its list.
            document[tag] = [token]

    bulk_data.append(document)


def parse_talk(filepath, db_conn):
    """Parse an exported KakaoTalk chat log and store its messages.

    The first line of the file is taken as the chat title, and the text after
    the first ':' on the second line is parsed as the export timestamp.
    Every following non-blank line is treated as either

    * a message line of the form ``<timestamp>, <name> : <text>``, or
    * a continuation line of the previous (multi-line) message, which is
      stored as its own document under the previous timestamp and sender.

    Documents are built by ``make_bulk_data`` and written with
    ``db_conn.insert_many`` in batches. The percentage of the file consumed
    and a final summary (title, first/last message time, export time) are
    printed to stdout.

    Args:
        filepath: Path of the UTF-8 encoded export file.
        db_conn: Collection-like object providing ``insert_many``.
    """
    start_date = None
    end_date = None

    # Timestamp and sender of the most recent message line; continuation
    # lines inherit both.
    date = None
    name = None
    bulk_data = []
    progress = 0

    # "with" closes the file even when parsing or a DB write raises.
    with open(filepath, 'r', encoding='utf-8') as f:
        # Total size, used only for the progress percentage.
        f.seek(0, os.SEEK_END)
        size = f.tell()
        f.seek(0, os.SEEK_SET)

        title = f.readline().strip()
        # A header line without ':' yields export_date = None instead of
        # aborting the whole import.
        _, header_sep, exported = f.readline().partition(':')
        export_date = get_datetime(exported) if header_sep else None

        while True:
            line = f.readline()
            if not line:
                break
            # Skip blank lines without consulting f.newlines, which may be
            # None or a plain string (substring test) rather than a tuple.
            if not line.strip():
                continue

            curr = int(f.tell() / size * 100)
            if curr != progress and curr <= 100:
                print('%d %%' % curr)
                progress = curr

            # Flush a full batch before handling the current line.
            if len(bulk_data) > 1000:
                db_conn.insert_many(bulk_data)
                bulk_data.clear()

            comma = line.find(',')
            # Parse into a temporary: a continuation line that merely
            # contains a comma must not reset the current message timestamp.
            stamp = get_datetime(line[:comma]) if comma >= 0 else None

            if stamp is None:
                text = line.strip()
            else:
                date = stamp
                if start_date is None:
                    start_date = stamp
                end_date = stamp

                rest = line[comma + 1:].strip()
                sender, colon, body = rest.partition(':')
                if colon:
                    name = sender.strip()
                    make_bulk_data(bulk_data, date, name, body.strip())
                    continue
                # NOTE(review): a timestamped line without "name : text" is
                # presumably a system notice (join/leave); as before, it is
                # stored under the previous sender - confirm this is wanted.
                text = rest

            # Continuation handling: needs a preceding message to attach to.
            if date is None or name is None:
                continue
            if not text:
                continue
            # A line that is nothing but a timestamp carries no message text.
            if get_datetime(text) is not None:
                continue

            make_bulk_data(bulk_data, date, name, text)

    # insert_many rejects an empty list, so only flush when something is left.
    if bulk_data:
        db_conn.insert_many(bulk_data)

    print('title', title)
    print('start_date', start_date)
    print('end_date', end_date)
    print('export_date', export_date)


# Connect to the MongoDB server and select the 'talk' collection of the
# 'test' database; conn_db() deletes every existing document in it first.
db_conn = conn_db('mongodb://192.168.50.152:27017', 'test', 'talk')
# Parse the exported chat log and load its messages into that collection.
parse_talk('talk.txt', db_conn)

## msg count by date
print('날짜 순위')
# Bucket messages by calendar day: the stored timestamp rendered as YYYY-MM-DD.
day_key = {'$dateToString': {'format': '%Y-%m-%d', 'date': '$date'}}
pipeline = [
    {'$group': {'_id': day_key, 'count': {'$sum': 1}}},
    {'$sort': {'count': -1}},  # busiest days first
    {'$limit': 10},  # keep the top ten only
]
for row in db_conn.aggregate(pipeline):
    print(row)

## msg count by hour
print('시간 순위')
# Group on the hour extracted from each message timestamp and count per hour.
hourly_group = {'$group': {'_id': {'$hour': '$date'}, 'count': {'$sum': 1}}}
pipeline = [
    hourly_group,
    {'$sort': {'count': -1}},  # most active hours first
]
for row in db_conn.aggregate(pipeline):
    print(row)

## msg count by name
print('메시지 순위')
# One bucket per sender, counting every stored document of that sender.
per_sender = {'$group': {'_id': '$name', 'count': {'$sum': 1}}}
pipeline = [
    per_sender,
    {'$sort': {'count': -1}},  # most talkative sender first
]
for row in db_conn.aggregate(pipeline):
    print(row)

## emoji count by name
print('이모티콘 순위')
# Only documents whose text is exactly '이모티콘' are counted
# (presumably the placeholder the export writes for an emoticon - verify).
emoticon_only = {'$match': {'msg': '이모티콘'}}
pipeline = [
    emoticon_only,
    {'$group': {'_id': '$name', 'count': {'$sum': 1}}},
    {'$sort': {'count': -1}},  # heaviest emoticon user first
]
for row in db_conn.aggregate(pipeline):
    print(row)

## pic count by name
print('사진 순위')
# Only documents whose text is exactly '사진' are counted
# (presumably the placeholder the export writes for a photo - verify).
photo_only = {'$match': {'msg': '사진'}}
pipeline = [
    photo_only,
    {'$group': {'_id': '$name', 'count': {'$sum': 1}}},
    {'$sort': {'count': -1}},  # sender with the most photos first
]
for row in db_conn.aggregate(pipeline):
    print(row)

## word top
print('단어 순위')
# Token lists are stored per tag name on each document; a document without a
# given tag has no such field, so $ifNull substitutes an empty array.
# Only nouns are ranked. Further tag fields (e.g. '$Verb', '$Adjective',
# '$Adverb', '$Josa', '$Number') can be appended to this list in the same
# {'$ifNull': [field, []]} form to include them.
tag_sources = [
    {'$ifNull': ['$Noun', []]},
]
pipeline = [
    {'$project': {'_id': False, 'tags': {'$concatArrays': tag_sources}}},
    {'$unwind': {'path': '$tags'}},  # one document per token
    {'$group': {'_id': '$tags', 'count': {'$sum': 1.0}}},
    {'$sort': {'count': -1}},  # most frequent words first
    {'$limit': 30},
]
for row in db_conn.aggregate(pipeline):
    print(row)

결과

"D:\PJT\Python PJT\PyKakaoTalkText\venv\Scripts\python.exe" "D:\PJT\JetBrains\PyCharm Community Edition 2018.3\helpers\pydev\pydevd.py" --multiproc --qt-support=auto --client 127.0.0.1 --port 59922 --file "D:/PJT/Python PJT/PyKakaoTalkText/main.py"
pydev debugger: process 31980 is connecting

Connected to pydev debugger (build 183.4284.139)
1 %
2 %
3 %
4 %
5 %
6 %
7 %
8 %
9 %
10 %
11 %
12 %
13 %
14 %
15 %
16 %
17 %
18 %
19 %
20 %
21 %
22 %
23 %
24 %
25 %
26 %
27 %
28 %
29 %
30 %
31 %
32 %
33 %
34 %
35 %
36 %
37 %
38 %
39 %
40 %
41 %
42 %
43 %
44 %
45 %
46 %
47 %
48 %
49 %
50 %
51 %
52 %
53 %
54 %
55 %
56 %
57 %
58 %
59 %
60 %
61 %
62 %
63 %
64 %
65 %
66 %
67 %
68 %
69 %
70 %
71 %
72 %
73 %
74 %
75 %
76 %
77 %
78 %
79 %
80 %
81 %
82 %
83 %
84 %
85 %
86 %
87 %
88 %
89 %
90 %
91 %
92 %
93 %
94 %
95 %
96 %
97 %
98 %
99 %
100 %
title 회사방 카카오톡 대화
start_date 2017-02-02 19:38:00
end_date 2019-02-01 14:35:00
export_date 2019-02-01 16:09:00
날짜 순위
{'_id': '2017-04-19', 'count': 982}
{'_id': '2017-08-01', 'count': 969}
{'_id': '2017-05-16', 'count': 946}
{'_id': '2017-05-02', 'count': 791}
{'_id': '2017-04-14', 'count': 576}
{'_id': '2017-04-24', 'count': 573}
{'_id': '2017-06-09', 'count': 573}
{'_id': '2018-08-28', 'count': 563}
{'_id': '2018-10-04', 'count': 556}
{'_id': '2017-05-04', 'count': 544}
시간 순위
{'_id': 16, 'count': 4479}
{'_id': 10, 'count': 3861}
{'_id': 17, 'count': 3036}
{'_id': 15, 'count': 2512}
{'_id': 9, 'count': 2403}
{'_id': 13, 'count': 2242}
{'_id': 18, 'count': 2141}
{'_id': 14, 'count': 2102}
{'_id': 11, 'count': 1805}
{'_id': 8, 'count': 1508}
{'_id': 19, 'count': 1200}
{'_id': 22, 'count': 1167}
{'_id': 12, 'count': 1006}
{'_id': 21, 'count': 927}
{'_id': 20, 'count': 744}
{'_id': 23, 'count': 643}
{'_id': 0, 'count': 580}
{'_id': 7, 'count': 345}
{'_id': 1, 'count': 122}
{'_id': 2, 'count': 54}
{'_id': 6, 'count': 18}
{'_id': 5, 'count': 13}
{'_id': 4, 'count': 7}
{'_id': 3, 'count': 5}
메시지 순위
{'_id': '배**', 'count': 5824}
{'_id': '한**', 'count': 5534}
{'_id': '고**', 'count': 5425}
{'_id': '정**', 'count': 4742}
{'_id': '회원님', 'count': 3732}
{'_id': '권**', 'count': 3490}
{'_id': '김**', 'count': 2404}
{'_id': '김**', 'count': 1769}
이모티콘 순위
{'_id': '배**', 'count': 114}
{'_id': '고**', 'count': 31}
{'_id': '김**', 'count': 19}
{'_id': '김**', 'count': 11}
{'_id': '권**', 'count': 7}
{'_id': '한**', 'count': 3}
{'_id': '회원님', 'count': 3}
사진 순위
{'_id': '배**', 'count': 119}
{'_id': '한**', 'count': 110}
{'_id': '회원님', 'count': 73}
{'_id': '고**', 'count': 72}
{'_id': '김**', 'count': 63}
{'_id': '김**', 'count': 40}
{'_id': '정**', 'count': 33}
{'_id': '권**', 'count': 26}
단어 순위
{'_id': '사진', 'count': 616.0}
{'_id': '개', 'count': 475.0}
{'_id': '회사', 'count': 352.0}
{'_id': '집', 'count': 352.0}
{'_id': '돈', 'count': 318.0}
{'_id': '그냥', 'count': 284.0}
{'_id': '대전', 'count': 269.0}
{'_id': '사람', 'count': 244.0}
{'_id': '일', 'count': 243.0}
{'_id': '오늘', 'count': 241.0}
{'_id': '슬슬', 'count': 229.0}

Process finished with exit code -1

Kim Ji Hwan

개요

목표

개발

환경

서버측 MongoDB 설치

클라이언트측 Python 설치

konlpy 패키지 설치

pymongo 패키지 설치

구현

코드

결과