Merge two lists in reverse chronological order using regular expressions in Python

Question:

I am trying to merge two lists in Python in reverse chronological order using a regular expression. I’m a little lost: the only way I have found so far to merge them without errors is to concatenate them with the ‘+’ operator. These are the two .txt files I am trying to merge.

file 1:

poptardsarefamous "Sometimes I wonder 2 == b or !(2 == b)" 2013 10 1 13:46:42
nohw4me "i have no idea what my cs prof is saying" 2013 10 1 12:07:14
pythondiva "My memory is great <3 64GB android" 2013 10 1 10:36:11
enigma "im so clever, my code is even unreadable to me!" 2013 10 1 09:27:00

file 2:

ocd_programmer "140 character limit? so i cant write my variable names" 2013 10 1 13:18:01
caffeine4life "BBBBZZZZzzzzzZZZZZZZzzzZZzzZzzZzTTTTttt" 2011 10 2 02:53:47

So far my code is

My question is how do I implement the merge_tweets() method to merge the two .txt files in reverse chronological order using regular expression?

import re
import sys

def read_tweets(file):
    """Parse a tweet file into a list of record dicts.

    Each line is expected to look like::

        @tweeter "tweet text" YEAR MONTH DAY HH:MM:SS

    The leading ``@`` is optional. Lines that do not match this layout
    are skipped.

    Args:
        file: Path of the text file to read.

    Returns:
        A list of dicts with the keys 'tweeter', 'tweet', 'year', 'month',
        'day' (ints for the three date parts) and 'time' (the raw
        'HH:MM:SS' string), in file order.
    """
    # \w and \d need their backslashes: a bare 'w+' / 'd+' only matches the
    # literal letters and would reject every real tweet line.
    pattern = r'@?(\w+) "(.*)" (\d+) (\d+) (\d+) (\d+:\d+:\d+)'

    records_list = []
    with open(file, 'r') as f:
        for line in f:
            match = re.search(pattern, line)
            if match:
                records_list.append({
                    'tweeter': match.group(1),
                    'tweet': match.group(2),
                    'year': int(match.group(3)),
                    'month': int(match.group(4)),
                    'day': int(match.group(5)),
                    'time': match.group(6)
                })
    return records_list

def merge_tweets(list1, list2):
    """Merge two tweet record lists into one list, newest tweet first.

    Args:
        list1: Records with integer 'year', 'month', 'day' values and a
            'time' string of the form 'HH:MM:SS'.
        list2: Records of the same shape as ``list1``.

    Returns:
        A new list holding every record from both inputs in reverse
        chronological order. The input lists are not modified.
    """
    def _chronological_key(record):
        # Compare the clock time numerically rather than as text so that
        # an un-padded value such as '9:05:00' still orders correctly.
        hour, minute, second = (int(part) for part in record['time'].split(':'))
        return (record['year'], record['month'], record['day'],
                hour, minute, second)

    return sorted(list1 + list2, key=_chronological_key, reverse=True)

def write_tweets(records_list, file):
    """Write tweet records to a text file, one tweet per line.

    Args:
        records_list: Iterable of dicts with the keys 'tweeter', 'tweet',
            'year', 'month', 'day' and 'time'.
        file: Path of the output file; it is created or overwritten.
    """
    with open(file, 'w') as f:
        for record in records_list:
            # Terminate each record with a real newline; a bare 'n' would
            # glue all tweets together on a single line.
            f.write(
                f'@{record["tweeter"]} "{record["tweet"]}" {record["year"]} {record["month"]} {record["day"]} {record["time"]}\n')

def main():
    """Read two tweet files, merge them and write the result.

    Expects exactly three command-line arguments: the two input files and
    the output file. Prints a usage message and exits with status 1
    otherwise.
    """
    if len(sys.argv) != 4:
        print('Usage: python twitter_sort.py <file1> <file2> <output_file>')
        sys.exit(1)

    file1, file2, output_file = sys.argv[1], sys.argv[2], sys.argv[3]

    print('Reading files...')
    records_list1 = read_tweets(file1)
    records_list2 = read_tweets(file2)

    # Report which input held more tweets (or that they tie).
    if len(records_list1) > len(records_list2):
        print(f'{file1} contained the most tweets with {len(records_list1)}.')
    elif len(records_list2) > len(records_list1):
        print(f'{file2} contained the most tweets with {len(records_list2)}.')
    else:
        print(f'{file1} and {file2} both contained {len(records_list1)} tweets.')

    # The leading '\n' separates the progress sections with a blank line.
    print('\nMerging files...')
    records_list = merge_tweets(records_list1, records_list2)
    print('Files merged.')

    print('\nWriting file...')
    write_tweets(records_list, output_file)
    print('File written.')
Asked By: tyrantGorilla

||

Answers:

I added some helper functions. A few comments:

  • read_files() does the merging, so I eliminated merge_tweets()
  • datetime is helpful when handling timestamps, and formatted timestamps are written to the file (you can reformat record["timestamp"] inside write_tweets() and write it out in your own format)
  • these functions pass lists stored in memory, so be careful if you have many tweets, in that case use iterators, which are memory efficient. I passed lists because your functions do so.
import re
import sys
from datetime import datetime


def read_files(file1, file2):
    """Read both tweet files into one combined record list.

    Args:
        file1: Path of the first tweet file.
        file2: Path of the second tweet file.

    Returns:
        A tuple ``(records_list, file_lengths)``: the parsed records of
        both files (file1's first, unsorted) and a two-item list with the
        number of tweets parsed from each file.
    """
    records_list, file_lengths = [], []
    for file in (file1, file2):
        # Counted by hand rather than with enumerate() so that an empty
        # file simply yields a count of 0.
        count = 0
        with open(file, 'r') as f:
            for line in f:
                record = read_tweet(line)
                # read_tweet() returns None for lines it cannot parse
                # (e.g. blank lines); storing None would crash the later
                # sort and inflate the tweet count.
                if record is None:
                    continue
                records_list.append(record)
                count += 1
        file_lengths.append(count)
    print('Files merged.')
    return records_list, file_lengths


def read_tweet(line: str):
    """Parse one tweet line into a record dict.

    The expected layout is ``tweeter "tweet text" YEAR MONTH DAY HH:MM:SS``.

    Args:
        line: A single line of a tweet file.

    Returns:
        A dict with the keys 'tweeter', 'tweet' and 'timestamp' (a
        ``datetime``), or ``None`` when the line does not match the layout.

    Raises:
        ValueError: If the numeric fields do not form a valid date/time
            (raised by ``datetime``).
    """
    # \w and \d need their backslashes: a bare 'w+' / 'd+' only matches the
    # literal letters and would reject every real tweet line.
    match = re.search(
        r'(\w+) "(.*)" (\d+) (\d+) (\d+) (\d+):(\d+):(\d+)', line)
    if not match:
        return None

    return {
        'tweeter': match.group(1),
        'tweet': match.group(2),
        'timestamp': datetime(
            year=int(match.group(3)),
            month=int(match.group(4)),
            day=int(match.group(5)),
            hour=int(match.group(6)),
            minute=int(match.group(7)),
            second=int(match.group(8)),
        ),
    }


def sort_tweets(records_list):
    """Return a new list of the records ordered newest-first.

    Records are compared by their "timestamp" value; the input is left
    untouched.
    """
    def _posted_at(record):
        return record["timestamp"]

    newest_first = list(records_list)
    newest_first.sort(key=_posted_at, reverse=True)
    return newest_first


def write_tweets(records_list, file):
    """Write tweet records to a text file, one tweet per line.

    Args:
        records_list: Iterable of dicts with the keys 'tweeter', 'tweet'
            and 'timestamp'; the timestamp is written in its default
            string form.
        file: Path of the output file; it is created or overwritten.
    """
    with open(file, 'w') as f:
        for record in records_list:
            # Terminate each record with a real newline; a bare 'n' would
            # glue all tweets together on a single line.
            f.write(f'@{record["tweeter"]} "{record["tweet"]}" {record["timestamp"]}\n')


def main():
    """Read two tweet files, sort all tweets newest-first and write them.

    Expects exactly three command-line arguments: the two input files and
    the output file. Prints a usage message and exits with status 1
    otherwise.
    """
    if len(sys.argv) != 4:
        print('Usage: python twitter_sort.py <file1> <file2> <output_file>')
        sys.exit(1)

    file1, file2, output_file = sys.argv[1], sys.argv[2], sys.argv[3]

    print('Reading files...')
    # read_files() returns (combined records, [count for file1, count for file2]).
    records_list = read_files(file1, file2)

    records_list_values = sort_tweets(records_list[0])
    records_list1_count, records_list2_count = records_list[1]

    # Report which input held more tweets (or that they tie).
    if records_list1_count > records_list2_count:
        print(f'{file1} contained the most tweets with {records_list1_count}.')
    elif records_list2_count > records_list1_count:
        print(f'{file2} contained the most tweets with {records_list2_count}.')
    else:
        print(f'{file1} and {file2} both contained {records_list1_count} tweets.')

    # The leading '\n' separates the progress sections with a blank line.
    print('\nWriting file...')
    write_tweets(records_list_values, output_file)
    print('File written.')


# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

Output:

@poptardsarefamous "Sometimes I wonder 2 == b or !(2 == b)" 2013-10-01 13:46:42
@ocd_programmer "140 character limit? so i cant write my variable names" 2013-10-01 13:18:01
@nohw4me "i have no idea what my cs prof is saying" 2013-10-01 12:07:14
@pythondiva "My memory is great <3 64GB android" 2013-10-01 10:36:11
@enigma "im so clever, my code is even unreadable to me!" 2013-10-01 09:27:00
@caffeine4life "BBBBZZZZzzzzzZZZZZZZzzzZZzzZzzZzTTTTttt" 2011-10-02 02:53:47
Answered By: Jonathan Ciapetti
Categories: questions Tags: , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.